Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -73,6 +73,10 @@
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+FunctionPass *createAMDGPULowerKernelArgumentsPass();
+void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsID;
+
 ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,267 @@
+//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads from
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesAll();
+  }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+    return false;
+
+  auto &TPC = getAnalysis<TargetPassConfig>();
+
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const SISubtarget &ST = TM.getSubtarget<SISubtarget>(F);
+  LLVMContext &Ctx = F.getParent()->getContext();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  BasicBlock &EntryBlock = *F.begin();
+  IRBuilder<> Builder(&*EntryBlock.begin());
+
+  SmallVector<Type *, 8> ArgTypes;
+  for (Argument &Arg : F.args()) {
+    Type *ArgTy = Arg.getType();
+    unsigned Size = DL.getTypeStoreSizeInBits(ArgTy);
+    bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+                    !ST.isAmdHsaOS();
+
+    // Clover seems to always pad i8/i16 to i32, but doesn't properly align
+    // them?
+    // Make sure the struct elements have correct size and alignment for ext
+    // args.
+    // These seem to be padded up to 4 bytes but not correctly aligned.
+    ArgTypes.push_back(
+      IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType());
+  }
+
+  StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName());
+  const StructLayout *Layout = DL.getStructLayout(ArgStructTy);
+
+  // Minimum alignment for the kernarg segment is 16.
+  unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy));
+  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+  // FIXME: Alignment is broken with explicit arg offset.
+  const uint64_t TotalKernArgSize = BaseOffset +
+    ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
+
+  CallInst *KernArgSegment =
+    Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
+                            F.getName() + ".kernarg.segment");
+
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+    Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+    Attribute::getWithAlignment(Ctx, KernArgBaseAlign));
+
+  Value *KernArgBase = KernArgSegment;
+  if (BaseOffset != 0) {
+    KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset);
+    KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset);
+  }
+
+  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+  Value *CastStruct = Builder.CreateBitCast(KernArgBase,
+                                            ArgStructTy->getPointerTo(AS));
+  for (Argument &Arg : F.args()) {
+    if (Arg.use_empty())
+      continue;
+
+    Type *ArgTy = Arg.getType();
+    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+      // can't represent this with range metadata because it's only allowed for
+      // integer types.
+      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+          ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        continue;
+
+      // FIXME: We can replace this with equivalent alias.scope/noalias
+      // metadata, but this appears to be a lot of work.
+      if (Arg.hasNoAliasAttr())
+        continue;
+    }
+
+    VectorType *VT = dyn_cast<VectorType>(ArgTy);
+    bool IsV3 = VT && VT->getNumElements() == 3;
+    VectorType *V4Ty = nullptr;
+
+    unsigned Size = DL.getTypeSizeInBits(ArgTy);
+    bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+                    !ST.isAmdHsaOS();
+    int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo());
+    int64_t AlignDownOffset = alignDown(EltOffset, 4);
+    int64_t OffsetDiff = EltOffset - AlignDownOffset;
+    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+
+    Value *ArgPtr;
+    if (Size < 32) {
+      // Since we don't have sub-dword scalar loads, avoid doing an extload by
+      // loading earlier than the argument address, and extracting the relevant
+      // bits.
+      //
+      // Additionally widen any sub-dword load to i32 even if suitably aligned,
+      // so that CSE between different argument loads works easily.
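+      //
+      // For example, an i16 argument at kernarg offset 38 gets
+      // AlignDownOffset = 36 and OffsetDiff = 2: an i32 is loaded from the
+      // dword-aligned offset 36, shifted right by OffsetDiff * 8 = 16 bits,
+      // and truncated back to i16 below.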
+
+      ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset);
+      ArgPtr = Builder.CreateBitCast(
+        ArgPtr,
+        Builder.getInt32Ty()->getPointerTo(AS),
+        Arg.getName() + ".kernarg.offset.align.down");
+    } else {
+      ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(),
+                                       Arg.getName() + ".kernarg.offset");
+    }
+
+    assert((!IsExtArg || !IsV3) && "incompatible situation");
+
+    if (IsV3 && Size >= 32) {
+      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+      ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+    }
+
+    LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+    MDBuilder MDB(Ctx);
+
+    if (isa<PointerType>(ArgTy)) {
+      if (Arg.hasNonNullAttr())
+        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+      uint64_t DerefBytes = Arg.getDereferenceableBytes();
+      if (DerefBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable,
+          MDNode::get(Ctx,
+                      MDB.createConstant(
+                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+      }
+
+      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+      if (DerefOrNullBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable_or_null,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          DerefOrNullBytes))));
+      }
+
+      unsigned ParamAlign = Arg.getParamAlignment();
+      if (ParamAlign != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_align,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          ParamAlign))));
+      }
+    }
+
+    // TODO: Convert noalias arg to !noalias
+
+    if (Size < 32) {
+      if (IsExtArg && OffsetDiff == 0) {
+        Type *I32Ty = Builder.getInt32Ty();
+        bool IsSext = Arg.hasSExtAttr();
+        Metadata *LowAndHigh[] = {
+          ConstantAsMetadata::get(
+            ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
+          ConstantAsMetadata::get(
+            ConstantInt::get(I32Ty,
+                             IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
+        };
+
+        Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
+      }
+
+      Value *ExtractBits = OffsetDiff == 0 ?
+        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+                                            Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(NewVal);
+    } else if (IsV3) {
+      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
+                                                {0, 1, 2},
+                                                Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Shuf);
+    } else {
+      Load->setName(Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Load);
+    }
+  }
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+                      "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
+                    "AMDGPU Lower Kernel Arguments", false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+  return new AMDGPULowerKernelArguments();
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -130,6 +130,12 @@
   cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableLowerKernelArguments(
+  "amdgpu-ir-lower-kernel-arguments",
+  cl::desc("Lower kernel argument loads in IR pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -155,6 +161,7 @@
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
+  initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -669,6 +676,10 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+      EnableLowerKernelArguments)
+    addPass(createAMDGPULowerKernelArgumentsPass());
+
   TargetPassConfig::addCodeGenPrepare();
 
   if (EnableLoadStoreVectorizer)
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerIntrinsics.cpp
+  AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
Index: test/CodeGen/AMDGPU/GlobalISel/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,11 +9,11 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -21,11 +21,11 @@
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) {
entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -36,11 +36,11 @@ ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -51,11 +51,11 @@ ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; XGCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -65,11 +65,11 @@ ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -79,11 +79,11 @@ ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/add_i64.ll =================================================================== --- test/CodeGen/AMDGPU/add_i64.ll +++ test/CodeGen/AMDGPU/add_i64.ll @@ -76,7 +76,7 @@ ; SI-NOT: addc ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) { %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 60 -; TRAP-HANDLER-DISABLE: NumSgprs: 76 +; TRAP-HANDLER-DISABLE: NumSgprs: 78 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -217,7 +217,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} ; SI-NOT: and ; 
SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) { %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -235,7 +235,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -381,7 +381,7 @@ ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 -define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) { %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b Index: test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/ashr.v2i16.ll +++ test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -12,25 +12,17 @@ ; CIVI: s_load_dword [[LHS:s[0-9]+]] ; CIVI: s_load_dword [[RHS:s[0-9]+]] -; VI: s_ashr_i32 -; VI: s_ashr_i32 -; VI: s_sext_i32_i16 -; VI: s_sext_i32_i16 -; VI: s_ashr_i32 -; VI: s_ashr_i32 -; VI: s_lshl_b32 -; VI: s_and_b32 -; VI: s_or_b32 - -; CI: s_ashr_i32 -; CI: s_and_b32 -; CI: s_lshr_b32 -; CI: s_sext_i32_i16 -; CI: s_ashr_i32 -; CI: s_ashr_i32 -; CI: s_lshl_b32 -; CI: s_and_b32 -define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_sext_i32_i16 +; CIVI-DAG: s_sext_i32_i16 +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_lshl_b32 +; CIVI: s_and_b32 +; CIVI: s_or_b32 + +define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -7,16 +7,16 @@ ; GFX9-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* 
%out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -70,15 +70,15 @@ ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s + +; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 + +; ALL-LABEL: {{^}}max_9_sgprs: +; ALL: SGPRBlocks: 1 +; ALL: NumSGPRsForWavesPerEU: 9 +define amdgpu_kernel void @max_9_sgprs() #0 { + %one = load volatile i32, i32 addrspace(4)* undef + %two = load volatile i32, i32 addrspace(4)* undef + %three = load volatile i32, i32 addrspace(4)* undef + %four = load volatile i32, i32 addrspace(4)* undef + %five = load volatile i32, i32 addrspace(4)* undef + %six = load volatile i32, i32 addrspace(4)* undef + %seven = load volatile i32, i32 addrspace(4)* undef + %eight = load volatile i32, i32 addrspace(4)* undef + %nine = load volatile i32, i32 addrspace(4)* undef + %ten = load volatile i32, i32 addrspace(4)* undef + call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight) + store volatile i32 %one, i32 addrspace(1)* undef + store volatile i32 %two, i32 addrspace(1)* undef + store volatile i32 %three, i32 addrspace(1)* undef + store volatile i32 %four, i32 addrspace(1)* undef + store volatile i32 %five, i32 addrspace(1)* undef + store volatile i32 %six, i32 addrspace(1)* undef + store volatile i32 %seven, i32 addrspace(1)* undef + store volatile i32 %eight, i32 addrspace(1)* undef + store volatile i32 %nine, i32 addrspace(1)* undef + store volatile i32 %ten, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,25 +1,37 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR 
-check-prefix=ALL %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s ; If spilling to smem, additional registers are used for the resource ; descriptor. +; FIXME: Vectorization can increase required SGPR count beyond limit. +; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 + ; ALL-LABEL: {{^}}max_9_sgprs: ; ALL: SGPRBlocks: 1 ; ALL: NumSGPRsForWavesPerEU: 9 -define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, - - i32 addrspace(1)* %out2, - i32 addrspace(1)* %out3, - i32 addrspace(1)* %out4, - i32 addrspace(1)* %out5, - i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 { - store i32 %one, i32 addrspace(1)* %out1 - store i32 %two, i32 addrspace(1)* %out2 - store i32 %three, i32 addrspace(1)* %out3 - store i32 %four, i32 addrspace(1)* %out4 - store i32 %five, i32 addrspace(1)* %out5 +define amdgpu_kernel void @max_9_sgprs() #0 { + %one = load volatile i32, i32 addrspace(4)* undef + %two = load volatile i32, i32 addrspace(4)* undef + %three = load volatile i32, i32 addrspace(4)* undef + %four = load volatile i32, i32 addrspace(4)* undef + %five = load volatile i32, i32 addrspace(4)* undef + %six = load volatile i32, i32 addrspace(4)* undef + %seven = load volatile i32, i32 addrspace(4)* undef + %eight = load volatile i32, i32 addrspace(4)* undef + %nine = load volatile i32, i32 addrspace(4)* undef + %ten = load volatile i32, i32 addrspace(4)* undef + call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine) + store volatile i32 %one, i32 addrspace(1)* undef + store volatile i32 %two, i32 addrspace(1)* undef + store volatile i32 %three, i32 addrspace(1)* undef + store volatile i32 %four, i32 addrspace(1)* undef + store volatile i32 %five, i32 addrspace(1)* undef + store volatile i32 %six, i32 addrspace(1)* undef + store volatile i32 %seven, i32 addrspace(1)* undef + store volatile i32 %eight, i32 addrspace(1)* undef + store volatile i32 %nine, i32 addrspace(1)* undef + store volatile i32 %ten, i32 addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- test/CodeGen/AMDGPU/basic-branch.ll +++ test/CodeGen/AMDGPU/basic-branch.ll @@ -29,7 +29,8 @@ ; GCN-LABEL: {{^}}test_brcc_i1: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]] +; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}} +; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]] ; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1 ; GCN: s_cmp_eq_u32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] Index: test/CodeGen/AMDGPU/bfe-patterns.ll =================================================================== --- test/CodeGen/AMDGPU/bfe-patterns.ll +++ test/CodeGen/AMDGPU/bfe-patterns.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}v_ubfe_sub_i32: ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] @@ -48,10 +48,9 @@ } ; GCN-LABEL: {{^}}s_ubfe_sub_i32: -; 
GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] -; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] +; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -63,11 +62,10 @@ } ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] -; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] -; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -126,10 +124,9 @@ } ; GCN-LABEL: {{^}}s_sbfe_sub_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] -; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] +; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -141,11 +138,10 @@ } ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] -; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] -; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x Index: test/CodeGen/AMDGPU/bfi_int.ll =================================================================== --- test/CodeGen/AMDGPU/bfi_int.ll +++ test/CodeGen/AMDGPU/bfi_int.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s +; RUN: llc 
-march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s ; BFI_INT Definition pattern from ISA docs ; (y & x) | (z & ~x) @@ -119,10 +119,10 @@ ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { %and0 = and i64 %a, %b %not.a = xor i64 %a, -1 @@ -136,10 +136,10 @@ ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN-DAG: v_bfi_b32 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { %xor.0 = xor i64 %a, %mask %and = and i64 %xor.0, %b @@ -155,8 +155,8 @@ ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN-DAG: v_bfi_b32 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { %xor.0 = xor i64 %a, %mask %and = and i64 %xor.0, %b Index: test/CodeGen/AMDGPU/br_cc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/br_cc.f16.ll +++ test/CodeGen/AMDGPU/br_cc.f16.ll @@ -11,22 +11,32 @@ ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; GCN: s_cbranch_vccnz -; GCN: one{{$}} -; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short -; GCN: s_endpgm +; SI: one{{$}} +; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]] -; GCN: two{{$}} -; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] -; GCN: buffer_store_short v[[B_F16]] -; GCN: s_endpgm +; SI: two{{$}} +; SI: v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]] + +; SI: one{{$}} +; SI: buffer_store_short v[[CVT]] +; SI: s_endpgm + + + +; VI: one{{$}} +; VI: buffer_store_short v[[A_F16]] +; VI: s_endpgm + +; VI: two{{$}} +; VI: buffer_store_short v[[B_F16]] +; VI: s_endpgm define amdgpu_kernel void @br_cc_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %fcmp = fcmp olt half %a.val, %b.val br i1 %fcmp, label %one, label %two Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -490,7 +490,7 @@ ; GCN-LABEL: {{^}}long_branch_hang: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} +; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: Index: test/CodeGen/AMDGPU/code-object-v3.ll =================================================================== --- test/CodeGen/AMDGPU/code-object-v3.ll +++ test/CodeGen/AMDGPU/code-object-v3.ll @@ -9,7 +9,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; 
OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -23,7 +23,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -123,7 +123,7 @@ } ; FUNC-LABEL: {{^}}s_ctlz_i64: -; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}} ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 @@ -133,7 +133,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -98,7 +98,7 @@ } ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64: -; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}} ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 @@ -108,7 +108,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -305,14 +305,14 @@ ; but there are some cases when the should be allowed. 
; FUNC-LABEL: {{^}}ctpop_i32_in_br: -; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16 +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT -define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else Index: test/CodeGen/AMDGPU/ctpop16.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop16.ll +++ test/CodeGen/AMDGPU/ctpop16.ll @@ -308,7 +308,9 @@ ; FUNC-LABEL: {{^}}ctpop_i16_in_br: ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] + +; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[CTPOP_ARG]] ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_short [[RESULT]], ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -13,13 +13,13 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone ; FUNC-LABEL: {{^}}s_ctpop_i64: -; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,11 +58,8 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; SI: s_load_dword s -; SI: s_load_dword s - -; GFX89: s_load_dwordx2 -; GFX89: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN: buffer_store_short ; GCN: buffer_store_short @@ -78,8 +75,8 @@ ; FIXME: Why sometimes vector shift? 
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: ; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dwordx2 s ; GFX89: s_load_dwordx2 s ; GFX89: s_load_dwordx2 s @@ -87,9 +84,7 @@ ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v - -; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -27,7 +27,7 @@ ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out, align 2 @@ -58,12 +58,8 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; SI: s_load_dword s -; SI: s_load_dwordx2 s -; SI: s_load_dword s - -; GFX89: s_load_dwordx2 -; GFX89: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOT: {{buffer|flat|global}}_load @@ -79,8 +75,7 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 ; SI: buffer_store_short ; SI: buffer_store_short @@ -100,12 +95,12 @@ ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: ; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c -; GFX89-DAG: s_load_dword s +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 ; GCN-NOT: {{buffer|flat|global}} @@ -113,17 +108,13 @@ ; SICI: buffer_store_short ; SICI: buffer_store_short -; SICI: buffer_load_ushort -; SICI: buffer_store_short - ; GFX9-NOT: s_pack_ll_b32_b16 ; GFX9-NOT: s_pack_lh_b32_b16 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}} - +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s ; GCN: {{buffer|global}}_store_short -define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out Index: test/CodeGen/AMDGPU/extract_vector_elt-i8.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -1,5 
+1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}extract_vector_elt_v1i8: ; GCN: s_load_dword [[LOAD:s[0-9]+]] @@ -14,7 +14,8 @@ ; GCN-LABEL: {{^}}extract_vector_elt_v2i8: ; GCN: s_load_dword s ; GCN-NOT: {{flat|buffer|global}} -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}} ; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -22,8 +23,8 @@ %p0 = extractelement <2 x i8> %foo, i32 0 %p1 = extractelement <2 x i8> %foo, i32 1 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -38,8 +39,8 @@ %p0 = extractelement <3 x i8> %foo, i32 0 %p1 = extractelement <3 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -54,24 +55,24 @@ %p0 = extractelement <4 x i8> %foo, i32 0 %p1 = extractelement <4 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } ; GCN-LABEL: {{^}}extract_vector_elt_v8i8: +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCN-NOT: {{flat|buffer|global}} +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16 -; GCN-NOT: {{flat|buffer|global}} +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { %p0 = extractelement <8 x i8> %foo, i32 0 %p1 = extractelement <8 x i8> %foo, i32 2 - %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* null + store volatile i8 %p0, i8 addrspace(1)* null ret void } @@ -87,25 +88,25 @@ %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } ; GCN-LABEL: {{^}}extract_vector_elt_v32i8: -; GCN: s_load_dword [[LOAD0:s[0-9]+]] -; GCN-NOT: {{flat|buffer|global}} -; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 -; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-NOT: {{s|flat|buffer|global}}_load +; GCN: 
s_load_dword [[VAL:s[0-9]+]] +; GCN-NOT: {{s|flat|buffer|global}}_load +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] ; GCN: buffer_store_byte [[V_ELT2]] ; GCN: buffer_store_byte [[V_LOAD0]] -define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 - %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* null + store volatile i8 %p0, i8 addrspace(1)* null ret void } @@ -121,8 +122,8 @@ %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -132,42 +133,36 @@ ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c ; VI-NOT: {{flat|buffer|global}} -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]] +; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]] -; VI: buffer_store_byte [[EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 { +; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]] +; VI: buffer_store_byte [[ELT]] +define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 { %elt = extractelement <2 x i8> %foo, i32 %idx - store i8 %elt, i8 addrspace(1)* %out + store volatile i8 %elt, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c ; VI-NOT: {{flat|buffer|global}} -; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]] -; VI: buffer_store_byte [[EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { +; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] +; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]] +; VI: buffer_store_byte [[V_ELT]] +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 { %p0 = extractelement <3 x i8> %foo, i32 
%idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -175,16 +170,16 @@ ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]] ; VI: buffer_store_byte [[V_EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 { %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr %p0 = extractelement <4 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -195,7 +190,7 @@ %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr %p0 = extractelement <8 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,9 +39,9 @@ } ; GCN-LABEL: {{^}}s_fabs_v4f16: -; CI: s_load_dword s[[LO:[0-9]+]] -; CI: s_load_dword s[[HI:[0-9]+]] +; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2 ; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 + ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}fabs_fold_f16: ; GCN: s_load_dword [[IN0:s[0-9]+]] -; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 +; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]| ; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]] @@ -62,6 +62,7 @@ ; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] +; GFX89-NOT: and ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]] ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]] ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] Index: test/CodeGen/AMDGPU/fabs.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f64.ll +++ test/CodeGen/AMDGPU/fabs.f64.ll @@ -53,11 +53,11 @@ } ; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm -define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fold_f64(double 
addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -65,11 +65,11 @@ } ; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm -define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) { %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/fabs.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.ll +++ test/CodeGen/AMDGPU/fabs.ll @@ -70,10 +70,11 @@ } ; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 @@ -82,10 +83,11 @@ } ; FUNC-LABEL: {{^}}fabs_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.f16.ll +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -16,8 +16,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fadd half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -65,10 +65,10 @@ ; VI: flat_load_dword v[[B_V2_F16:[0-9]+]] ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 
v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -102,13 +102,13 @@ ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] @@ -133,13 +133,13 @@ ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/fcmp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp.f16.ll +++ test/CodeGen/AMDGPU/fcmp.f16.ll @@ -16,8 +16,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp olt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -42,8 +42,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %a.abs = call half @llvm.fabs.f16(half %a.val) %b.abs = call half @llvm.fabs.f16(half %b.val) %r.val = fcmp olt half %a.abs, %b.abs @@ -67,8 +67,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* 
%a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp oeq half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -90,8 +90,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ole half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -113,8 +113,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ogt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -136,8 +136,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp one half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -159,8 +159,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp oge half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -182,8 +182,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ord half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -205,8 +205,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp uno half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -228,8 +228,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ult half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -251,8 +251,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ueq half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -274,8 +274,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile 
half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ule half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -297,8 +297,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ugt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -320,8 +320,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp une half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -343,8 +343,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp uge half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r Index: test/CodeGen/AMDGPU/fcopysign.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f16.ll +++ test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -30,8 +30,8 @@ half addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %mag = load volatile half, half addrspace(1)* %arg_mag + %sign = load volatile half, half addrspace(1)* %arg_sign %out = call half @llvm.copysign.f16(half %mag, half %sign) store half %out, half addrspace(1)* %arg_out ret void Index: test/CodeGen/AMDGPU/fcopysign.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f32.ll +++ test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -8,12 +8,11 @@ ; Try to identify arg based on higher address. 
; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc -; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c -; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] +; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb +; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c + +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] ; GCN: buffer_store_dword [[RESULT]], Index: test/CodeGen/AMDGPU/fcopysign.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f64.ll +++ test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -6,10 +6,10 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone ; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 @@ -17,15 +17,15 @@ ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} ; GCN: s_endpgm -define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, double addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}test_copysign_f64_f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] @@ -33,7 +33,7 @@ ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} 
-define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %c) store double %result, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/fma.ll =================================================================== --- test/CodeGen/AMDGPU/fma.ll +++ test/CodeGen/AMDGPU/fma.ll @@ -64,9 +64,9 @@ ; SI: v_fma_f32 ; SI: v_fma_f32 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} -; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}} ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}} ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].X Index: test/CodeGen/AMDGPU/fmin_legacy.ll =================================================================== --- test/CodeGen/AMDGPU/fmin_legacy.ll +++ test/CodeGen/AMDGPU/fmin_legacy.ll @@ -25,14 +25,12 @@ } ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]] +; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,6 +1,6 @@ -; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't @@ -44,7 +44,7 @@ ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define 
amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.f16.ll +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -17,8 +17,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fmul half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -36,7 +36,7 @@ half addrspace(1)* %r, half addrspace(1)* %b) { entry: - %b.val = load half, half addrspace(1)* %b + %b.val = load volatile half, half addrspace(1)* %b %r.val = fmul half 3.0, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -55,24 +55,24 @@ half addrspace(1)* %r, half addrspace(1)* %a) { entry: - %a.val = load half, half addrspace(1)* %a + %a.val = load volatile half, half addrspace(1)* %a %r.val = fmul half %a.val, 4.0 store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fmul_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] @@ -82,6 +82,8 @@ ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -100,13 +102,13 @@ ; GCN-LABEL: {{^}}fmul_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 
v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 @@ -133,13 +135,13 @@ ; GCN-LABEL: {{^}}fmul_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -164,13 +166,15 @@ } ; GCN-LABEL: {{^}}fmul_v4f16: -; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} +; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} ; VI: v_mul_f16_sdwa ; VI: v_mul_f16_e32 ; VI: v_mul_f16_sdwa Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -109,8 +109,9 @@ } ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16: -; CI: s_load_dword s -; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 +; CI: s_load_dword [[IN:s[0-9]+]] +; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000 +; CI: s_lshr_b32 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/fneg-fabs.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -55,14 +55,13 @@ } ; GCN-LABEL: {{^}}fneg_fabs_f64: -; GCN-DAG: s_load_dwordx2 ; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} -; SI-DAG: s_load_dwordx2 
s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c +; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13 +; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs store double %fsub, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/fneg-fabs.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.ll +++ test/CodeGen/AMDGPU/fneg-fabs.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs @@ -15,7 +15,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: ; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}| ; SI-NOT: and define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) Index: test/CodeGen/AMDGPU/fneg.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f64.ll +++ test/CodeGen/AMDGPU/fneg.f64.ll @@ -48,11 +48,11 @@ } ; GCN-LABEL: {{^}}fneg_fold_f64: -; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN-NOT: xor ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fsub = fsub double -0.0, %in %fmul = fmul double %fsub, %in store double %fmul, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/frame-index-amdgiz.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-amdgiz.ll +++ test/CodeGen/AMDGPU/frame-index-amdgiz.ll @@ -13,18 +13,17 @@ define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 { entry: ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK: s_load_dword s2, s[0:1], 0xb -; CHECK: s_load_dword s0, s[0:1], 0xc +; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; CHECK: s_mov_b32 s10, -1 -; CHECK: s_waitcnt lgkmcnt(0) -; CHECK: s_lshl_b32 s1, s2, 2 ; CHECK: v_mov_b32_e32 v0, 4 +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK: s_lshl_b32 s0, s0, 2 +; CHECK: v_add_i32_e32 v1, vcc, s0, v0 +; CHECK: s_lshl_b32 s0, s1, 2 ; 
CHECK: s_mov_b32 s11, 0xe8f000 -; CHECK: v_add_i32_e32 v1, vcc, s1, v0 ; CHECK: v_mov_b32_e32 v2, 7 -; CHECK: s_lshl_b32 s0, s0, 2 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen ; CHECK: v_add_i32_e32 v0, vcc, s0, v0 ; CHECK: s_mov_b32 s7, 0xf000 @@ -35,14 +34,14 @@ ; CHECK: s_endpgm %x = alloca [100 x i32], align 4, addrspace(5) - %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0 %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i store i32 7, i32 addrspace(5)* %arrayidx, align 4 %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j - %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4 - store i32 %1, i32 addrspace(1)* %a, align 4 - call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %ld = load i32, i32 addrspace(5)* %arrayidx2, align 4 + store i32 %ld, i32 addrspace(1)* %a, align 4 + call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0 ret void } Index: test/CodeGen/AMDGPU/fsub.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.f16.ll +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -17,8 +17,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fsub half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -36,7 +36,7 @@ half addrspace(1)* %r, half addrspace(1)* %b) { entry: - %b.val = load half, half addrspace(1)* %b + %b.val = load volatile half, half addrspace(1)* %b %r.val = fsub half 1.0, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -54,33 +54,41 @@ half addrspace(1)* %r, half addrspace(1)* %a) { entry: - %a.val = load half, half addrspace(1)* %a + %a.val = load volatile half, half addrspace(1)* %a %r.val = fsub half %a.val, 2.0 store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fsub_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; 
SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -101,13 +109,13 @@ ; GCN-LABEL: {{^}}fsub_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] @@ -135,13 +143,13 @@ ; GCN-LABEL: {{^}}fsub_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/global_smrd.ll =================================================================== --- test/CodeGen/AMDGPU/global_smrd.ll +++ test/CodeGen/AMDGPU/global_smrd.ll @@ -5,7 +5,7 @@ ; CHECK: s_load_dwordx4 ; CHECK-NOT: flat_load_dword -define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) { +define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) { bb: %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8 %tmp3 = fadd float %tmp2, 0.000000e+00 
@@ -28,7 +28,7 @@ ; CHECK: flat_load_dword ; CHECK-NOT: s_load_dwordx4 -define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 { +define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp @@ -59,7 +59,7 @@ ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) { +define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) { store i32 0, i32 addrspace(1)* %out0 %val = load i32, i32 addrspace(1)* %in store i32 %val, i32 addrspace(1)* %out1 @@ -71,7 +71,7 @@ ; CHECK: flat_store_dword ; CHECK: flat_load_dword [[VVAL:v[0-9]+]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) { +define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) { store i32 0, i32 addrspace(1)* %out0 %val = load i32, i32 addrspace(1)* %in store i32 %val, i32 addrspace(1)* %out1 @@ -80,19 +80,20 @@ ; uniform load from global array ; CHECK-LABEL: @global_array -; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]] +; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] +; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] - @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4 define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) { entry: - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 - %1 = load i32, i32 addrspace(1)* %0, align 4 - store i32 %1, i32 addrspace(1)* %out, align 4 + %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 + %load1 = load i32, i32 addrspace(1)* %load0, align 4 + store i32 %load1, i32 addrspace(1)* %out, align 4 ret void } @@ -105,13 +106,13 @@ ; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) { +define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) { entry: %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n store i32 12, i32 addrspace(1) * %gep - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 - %1 = load i32, i32 addrspace(1)* %0, align 4 - store i32 %1, i32 addrspace(1)* %out, align 4 + %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 + %load1 = load i32, i32 addrspace(1)* %load0, align 4 + store i32 %load1, i32 addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- 
test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -22,13 +22,8 @@ } ; GCN-LABEL: {{^}}load_v3f16_arg: -; SI: s_load_dwordx2 -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 - +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOT: {buffer|flat|global}}_load_ @@ -45,11 +40,7 @@ ; FIXME: Why not one load? ; GCN-LABEL: {{^}}load_v4f16_arg: -; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 -; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3 - -; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 - +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}} ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]] ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}} @@ -86,14 +77,8 @@ } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; SI: s_load_dwordx2 s -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 - +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s ; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -116,14 +101,7 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx4 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -154,7 +132,7 @@ } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; GCN: s_load_dword +; GCN-DAG: s_load_dword s ; GCN: s_lshr_b32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -169,14 +147,8 @@ } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -; SI: s_load_dword -; SI: s_load_dword - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 - -; GCN: s_lshr_b32 - +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -191,19 +163,17 @@ } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN: s_endpgm define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> @@ -212,14 +182,8 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; SI: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NOT: _load_ - -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx4 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -299,12 +263,13 @@ ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: ; GCN: flat_load_dword [[LOAD:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: 
v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] + ; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm @@ -343,6 +308,7 @@ ; GCN: flat_load_dwordx4 ; GCN: flat_load_dwordx4 +; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 @@ -351,7 +317,6 @@ ; GCN: flat_store_dwordx4 -; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 @@ -430,19 +395,19 @@ ; XVI-NOT: v_cvt_f32_f16 ; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] -; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] -; SI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] -; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 - -; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] -; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa +; GCN-NOT: v_cvt_f32_f16 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN-NOT: v_cvt_f64_f32_e32 -; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}} -; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]] +; GCN-DAG: flat_store_dwordx4 +; GCN-DAG: flat_store_dwordx2 ; GCN: s_endpgm define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -8,7 +8,7 @@ ; CHECK: Version: [ 1, 0 ] ; CHECK: Kernels: -; CHECK: - Name: test +; CHECK-LABEL: - Name: test ; CHECK: SymbolName: 'test@kd' ; CHECK: CodeProps: ; CHECK: KernargSegmentSize: 24 @@ -16,8 +16,8 @@ ; CHECK: PrivateSegmentFixedSize: 0 ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 -; CHECK: NumSGPRs: 6 -; CHECK: NumVGPRs: 3 +; CHECK: NumSGPRs: 8 +; CHECK: NumVGPRs: 6 ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test( half addrspace(1)* %r, @@ -31,18 +31,24 @@ ret void } -; CHECK: - Name: num_spilled_sgprs +; CHECK-LABEL: - Name: num_spilled_sgprs ; CHECK: SymbolName: 'num_spilled_sgprs@kd' ; CHECK: CodeProps: -; CHECK: NumSpilledSGPRs: 41 +; GFX700: NumSpilledSGPRs: 40 +; GFX803: NumSpilledSGPRs: 24 +; GFX900: NumSpilledSGPRs: 24 define amdgpu_kernel void @num_spilled_sgprs( - i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, - i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, - i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8, - i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, - i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute, - i32 addrspace(1)* 
%outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4, - i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb, + i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], + i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], + i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32], + i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32], + i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32], + i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32], + i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32], + i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32], + i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], + i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], + i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { entry: store i32 %in0, i32 addrspace(1)* %out0 @@ -64,7 +70,7 @@ ret void } -; CHECK: - Name: num_spilled_vgprs +; CHECK-LABEL: - Name: num_spilled_vgprs ; CHECK: SymbolName: 'num_spilled_vgprs@kd' ; CHECK: CodeProps: ; CHECK: NumSpilledVGPRs: 14 Index: test/CodeGen/AMDGPU/imm.ll =================================================================== --- test/CodeGen/AMDGPU/imm.ll +++ test/CodeGen/AMDGPU/imm.ll @@ -344,114 +344,114 @@ ; GCN-LABEL: {{^}}add_inline_imm_0.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0.5 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -0.5 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 1.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -1.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 2.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -2.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 4.0 store double %y, double addrspace(1)* %out ret void } ; 
GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -4.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30 ; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}} ; VI: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -461,40 +461,40 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000001 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2_f64(double 
addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000002 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000010 store double %y, double addrspace(1)* %out ret void @@ -504,7 +504,7 @@ ; GCN: v_mov_b32_e32 v0, -1 ; GCN: v_mov_b32_e32 v1, v0 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xffffffffffffffff store double %y, double addrspace(1)* %out ret void @@ -514,7 +514,7 @@ ; GCN: v_mov_b32_e32 v0, -2 ; GCN: v_mov_b32_e32 v1, -1 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xfffffffffffffffe store double %y, double addrspace(1)* %out ret void @@ -524,29 +524,29 @@ ; GCN: v_mov_b32_e32 v0, -16 ; GCN: v_mov_b32_e32 v1, -1 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xfffffffffffffff0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_63_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x000000000000003F store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_64_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000040 store double %y, double addrspace(1)* %out ret void Index: 
test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -310,9 +310,9 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_dword +; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 +; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as @@ -75,8 +75,9 @@ ; GCN-LABEL: {{^}}insertelement_to_sgpr: ; GCN-NOT: v_readfirstlane -define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind { - %tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0 +define <4 x float> @insertelement_to_sgpr() nounwind { + %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef + %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) ret <4 x float> %tmp2 } @@ -154,11 +155,11 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32: -; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} +; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}} ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -201,23 +202,17 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 
3 -; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 - -; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]] -; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]] - -; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]] -; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] -; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]] +; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]] +; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]] +; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]] ; VI: buffer_store_short [[OR]] -define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 ret void @@ -227,68 +222,51 @@ ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]] -; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}} - -; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}} +; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]] - -; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] -; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]] -; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]] -; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]] -; VI: buffer_store_short [[BFI]] -; VI: buffer_store_byte [[HI2]] -define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]] +; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16 + +; VI-DAG: buffer_store_short [[BFI]] +; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]] +; VI: buffer_store_byte [[V_HI2]] +define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8: -; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}} - - -; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24 -; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16 -; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]] -; VI: v_or_b32_e32 -; VI: v_or_b32_sdwa +; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_or_b32_sdwa -; VI: s_lshl_b32 -; VI: v_bfi_b32 -define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: buffer_store_dword [[BFI]] +define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8: -; VI-NOT: {{buffer|flat|global}} -; VI: s_load_dword [[IDX:s[0-9]]] -; VI-NOT: {{buffer|flat|global}} -; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; VI-NOT: {{buffer|flat|global}} +; VI-NOT: {{buffer|flat|global}}_load +; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10 +; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}} +; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] ; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} @@ -307,13 +285,8 @@ ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: ; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 ; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN-NOT: _load_ - ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -368,7 +341,7 @@ ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}} -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}} ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} @@ -390,7 +363,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 ret void @@ -420,19 +393,18 @@ ; space is also 2x what should be required. 
; GCN-LABEL: {{^}}dynamic_insertelement_v4f64: -; GCN: SCRATCH_RSRC_DWORD ; Stack store -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} ; Write element -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}} ; Stack reload -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} ; Store result ; GCN: buffer_store_dwordx4 @@ -447,19 +419,17 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: -; GCN-DAG: SCRATCH_RSRC_DWORD - -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}} -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,9 +1,9 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn 
-mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}} @@ -18,17 +18,17 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg: -; GCN: s_load_dword [[ELT0:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] -; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]] +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -36,29 +36,29 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg: -; GCN: s_load_dword [[ELT0:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 ; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 -; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]] ; CI-DAG: ; use [[SHR]] ; FIXME: Should be able to void mask of upper bits -; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}} -; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]] -; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], 
[[VEC_HIMASK]] +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 ; VI-DAG: ; use [[SHR]] ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 -; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]] ; GFX9-DAG: ; use [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -69,16 +69,17 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]] +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]] ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -88,7 +89,7 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], ; GCN: s_load_dword [[VEC:s[0-9]+]], ; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 @@ -110,7 +111,7 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], ; GCN: s_load_dword [[VEC:s[0-9]+]], ; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 @@ -161,15 +162,16 @@ } ; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg: -; GCN: s_load_dword [[ELT1:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GCN-NOT: shlr -; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]] +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -444,12 +446,11 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 
0x3e7 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -473,12 +474,11 @@ ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -501,17 +501,18 @@ } ; GCN-LABEL: {{^}}v_insertelement_v4f16_0: -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5], ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]] -; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}} -define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { +define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -531,12 +532,13 @@ ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]] ; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]] -; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 -; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]] ; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]] -; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]] +; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { @@ -553,17 +555,18 @@ } ; GCN-LABEL: {{^}}v_insertelement_v4f16_2: -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5], ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]] -; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[}}[[LO]]:[[INS_HI]]{{\]}} -define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { +define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -583,12 +586,13 @@ ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]] ; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]] -; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 -; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]] ; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]] -; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { @@ -611,8 +615,9 @@ ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]] -; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -210,8 +210,10 @@ ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -; GCN: s_load_dword s -; GCN-NOT: {{buffer|flat|global}}_load_ +; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb + +; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 @@ -226,8 +228,7 @@ ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 ; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c @@ -236,6 +237,7 @@ store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void } + ; FUNC-LABEL: {{^}}v3i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 @@ -274,8 +276,8 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; GCN: s_load_dword s -; GCN-NOT: {{buffer|flat|global}}_load_ +; GCN-DAG: s_load_dwordx2 s +; GCN-DAG: s_load_dword s define amdgpu_kernel void @v4i8_arg(<4 x i8> 
addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out @@ -290,12 +292,18 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c + +; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -348,23 +356,16 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 - -; SI: s_load_dword s -; SI: s_load_dword s +; SI-NOT: {{buffer|flat|global}}_load ; SI: s_load_dwordx2 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dword s -; VI: s_load_dword s - -; VI: v_lshlrev_b16 -; VI: v_or_b32_e32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: s_lshr_b32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa +; VI: s_load_dwordx2 s +; VI-NEXT: s_load_dwordx2 s +; VI-NOT: lshl +; VI-NOT: _or +; VI-NOT: _sdwa define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out @@ -383,19 +384,14 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 +; SI-NEXT: s_load_dwordx2 ; SI-NOT: {{buffer|flat|global}}_load -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c +; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -413,6 +409,7 @@ ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X + ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 @@ -462,33 +459,16 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s - -; VI: s_lshr_b32 -; VI: v_lshlrev_b16 -; VI: s_lshr_b32 -; VI: s_lshr_b32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: v_lshlrev_b16 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: v_lshlrev_b16 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa +; VI: s_load_dwordx4 s +; VI-NOT: shr +; VI-NOT: shl +; VI-NOT: _sdwa 
+; VI-NOT: _or_ define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out @@ -516,27 +496,14 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s - +; SI: s_load_dwordx8 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c +; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38 +; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out @@ -600,22 +567,21 @@ } ; FUNC-LABEL: {{^}}kernel_arg_i64: -; MESA-GCN: s_load_dwordx2 -; MESA-GCN: s_load_dwordx2 +; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24 +; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 + ; MESA-GCN: buffer_store_dwordx2 -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}f64_kernel_arg: -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb -; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 -; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c +; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 ; MESA-GCN: buffer_store_dwordx2 -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 + +; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -6,7 +6,7 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -4,8 +4,8 @@ declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) ; GCN-LABEL: {{^}}class_f16: -; GCN: 
buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_I32:[0-9]+]] +; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] @@ -33,7 +33,9 @@ ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fabs( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 %b.val) { entry: %a.val.fabs = call half @llvm.fabs.f16(half %a.val) @@ -53,7 +55,9 @@ ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fneg( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 %b.val) { entry: %a.val.fneg = fsub half -0.0, %a.val @@ -73,7 +77,9 @@ ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fabs_fneg( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 %b.val) { entry: %a.val.fabs = call half @llvm.fabs.f16(half %a.val) Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 @@ -7,14 +7,14 @@ declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -22,14 +22,14 @@ } ; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -38,14 +38,14 @@ } ; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 
0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -54,14 +54,14 @@ } ; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -183,14 +183,14 @@ } ; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -198,14 +198,14 @@ } ; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -214,14 
+214,14 @@ } ; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -230,14 +230,14 @@ } ; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -268,14 +268,14 @@ ; Set all 9 bits of mask ; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] ; SI-NOT: vcc ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: 
v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void 
@s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -3,11 +3,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -15,9 +15,9 @@ half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -35,8 +35,8 @@ half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -54,8 +54,8 @@ half addrspace(1)* %a, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -73,8 +73,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0) store half %r.val, half addrspace(1)* %r ret void @@ -90,7 +90,7 @@ half addrspace(1)* %r, half addrspace(1)* %c) { entry: - %c.val = load half, half addrspace(1)* %c + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val) 
store half %r.val, half addrspace(1)* %r ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -5,18 +5,20 @@ declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone ; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind { %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s ; FIXME: Enable for VI. 
@@ -8,33 +8,36 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone ; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + +; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1 + ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 ; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -43,26 +46,32 @@ ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: 
v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] +; GCN: buffer_store_dword [[RESULT]], +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 + +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; GCN: buffer_store_dword [[RESULT]], +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -77,8 +86,8 @@ } ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone @@ -87,8 +96,8 @@ } ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: s_mov_b64 vcc, 0 +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, float addrspace(1)* %out, align 4 @@ -96,8 +105,8 @@ } ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: s_mov_b64 vcc, -1 +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, float addrspace(1)* %out, align 4 Index: 
test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -230,13 +230,13 @@ } ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -244,13 +244,13 @@ } ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -258,14 +258,14 @@ } ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -273,14 +273,14 @@ } ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -7,7 +7,7 @@ ; GCN: s_load_dword s[[S_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void @@ -41,7 +41,6 @@ ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen - ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -24,14 +24,14 @@ ; GCN-LABEL: {{^}}ceil_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: 
v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -24,26 +24,31 @@ } ; GCN-LABEL: {{^}}cos_v2f16 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] + +; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] -; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-NOT: and Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -2,11 +2,9 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s ; GCN-LABEL: {{^}}test_debug_value: -; NOOPT: s_load_dwordx2 s[4:5] - -; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE? 
-; NOOPT: ; kill: def $sgpr8_sgpr9 killed $sgpr4_sgpr5 -; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef +; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5 ; GCN: flat_store_dword ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -24,13 +24,13 @@ ; GCN-LABEL: {{^}}floor_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -105,16 +105,17 @@ ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] + ; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] @@ -145,8 +146,9 @@ } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] @@ -157,13 +159,14 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 
v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] + +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] @@ -186,8 +189,8 @@ } ; GCN-LABEL: {{^}}fma_v2f16_imm_b: -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] @@ -195,10 +198,10 @@ ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] @@ -229,8 +232,8 @@ } ; GCN-LABEL: {{^}}fma_v2f16_imm_c: -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] @@ -238,26 +241,31 @@ ; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] + +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; 
SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] - ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -58,8 +58,8 @@ half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -87,56 +87,64 @@ half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fmuladd_v2f16 -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]] + +; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 
v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] + + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] -; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] + +; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] ; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]] +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]] +; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] +; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -22,8 +22,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) store half %r.val, half addrspace(1)* %r ret void @@ -66,17 +66,16 @@ } ; GCN-LABEL: {{^}}maxnum_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 
v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] @@ -89,7 +88,7 @@ ; VI-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] -; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -107,13 +106,13 @@ ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] @@ -127,7 +126,6 @@ ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { @@ -140,13 +138,13 @@ ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 ; VI-DAG: v_max_f16_sdwa 
v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -162,7 +160,6 @@ ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -192,8 +189,8 @@ ; GCN-LABEL: {{^}}maxnum_v4f16: ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} define amdgpu_kernel void @maxnum_v4f16( <4 x half> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -22,8 +22,8 @@ half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) store half %r.val, half addrspace(1)* %r ret void @@ -66,32 +66,31 @@ } ; GCN-LABEL: {{^}}minnum_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX9: 
v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -106,22 +105,21 @@ ; GCN-LABEL: {{^}}minnum_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 ; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] @@ -139,26 +137,28 @@ ; GCN-LABEL: {{^}}minnum_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] + + ; SIVI-NOT: and ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] + ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* 
%a) { @@ -188,8 +188,8 @@ ; GCN-LABEL: {{^}}minnum_v4f16: ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} define amdgpu_kernel void @minnum_v4f16( <4 x half> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -25,13 +25,13 @@ ; GCN-LABEL: {{^}}rint_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -1,10 +1,10 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) -; GCN-LABEL: {{^}}sin_f16 +; GCN-LABEL: {{^}}sin_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]] @@ -23,16 +23,20 @@ ret void } -; GCN-LABEL: {{^}}sin_v2f16 +; GCN-LABEL: {{^}}sin_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], 
v[[HALF_PIE]] -; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] -; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -40,12 +44,11 @@ ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] ; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] ; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; VI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; VI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -24,13 +24,13 @@ ; GCN-LABEL: {{^}}trunc_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] Index: test/CodeGen/AMDGPU/load-select-ptr.ll =================================================================== --- test/CodeGen/AMDGPU/load-select-ptr.ll +++ test/CodeGen/AMDGPU/load-select-ptr.ll @@ -17,7 +17,7 @@ ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 -define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* %ptr1, i64 addrspace(1)* %ptr2) { +define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x 
i32], i64 addrspace(1)* %ptr2) { %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, i64* %ptr0, align 8 %tmp4 = load i64, i64* %ptr1, align 8 @@ -38,7 +38,7 @@ ; GCN: v_cndmask_b32 ; GCN: v_cndmask_b32 ; GCN: flat_store_dwordx2 -define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1, i64 addrspace(1)* %ptr2) { +define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) { %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8 %tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8 Index: test/CodeGen/AMDGPU/lower-kernargs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/lower-kernargs.ll @@ -0,0 +1,1286 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; FIXME: Manually added checks for metadata nodes at bottom +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s +; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s + +define amdgpu_kernel void @kern_noargs() { +; HSA-LABEL: @kern_noargs( +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noargs( +; MESA-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @kern_i8(i8 %arg) #0 { +; HSA-LABEL: @kern_i8( +; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]] to [[KERN_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i8( +; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_i16(i16 %arg) #0 { +; HSA-LABEL: @kern_i16( +; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]] to [[KERN_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: 
[[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i16( +; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_f16(half %arg) #0 { +; HSA-LABEL: @kern_f16( +; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]] to [[KERN_F16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP4]] to half +; HSA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_f16( +; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP5]] to half +; MESA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store half %arg, half addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 { +; HSA-LABEL: @kern_zeroext_i8( +; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* 
[[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_zeroext_i8( +; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !1, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 { +; HSA-LABEL: @kern_zeroext_i16( +; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_zeroext_i16( +; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !2, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 { +; HSA-LABEL: @kern_signext_i8( +; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] 
= getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_signext_i8( +; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !3, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 { +; HSA-LABEL: @kern_signext_i16( +; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_signext_i16( +; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) { +; HSA-LABEL: @kern_i8_i8( +; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]] to [[KERN_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: 
[[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i8_i8( +; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef, align 1 + store volatile i8 %arg1, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) { +; HSA-LABEL: @kern_v3i8( +; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]] to [[KERN_V3I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8> +; HSA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v3i8( +; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr 
inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP5]] to <3 x i8> +; MESA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @kern_i24(i24 %arg0) { +; HSA-LABEL: @kern_i24( +; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]] to [[KERN_I24:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; HSA-NEXT: store i24 [[TMP4]], i24 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i24( +; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I24:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 +; MESA-NEXT: store i24 [[TMP5]], i24 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i24 %arg0, i24 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_i32(i32 %arg0) { +; HSA-LABEL: @kern_i32( +; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]] to [[KERN_I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i32( +; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = 
getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_f32(float %arg0) { +; HSA-LABEL: @kern_f32( +; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]] to [[KERN_F32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_f32( +; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef +; MESA-NEXT: ret void +; + store float %arg0, float addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) { +; HSA-LABEL: @kern_v3i32( +; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]] to [[KERN_V3I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; HSA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v3i32( +; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +;
MESA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) { +; HSA-LABEL: @kern_i32_v3i32( +; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 1 +; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i32_v3i32( +; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 1 +; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4 + ret void +} + +%struct.a = type { i32, i8, [4 x i8] } +%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> } + +define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) { +; HSA-LABEL: @kern_struct_a( +; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)*
[[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_struct_a( +; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef +; MESA-NEXT: ret void +; + store %struct.a %arg0, %struct.a addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 { +; HSA-LABEL: @kern_struct_b_packed( +; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_struct_b_packed( +; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef +; MESA-NEXT: ret void +; + store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 { +; HSA-LABEL: @kern_implicit_arg_num_bytes( +; HSA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; 
HSA-NEXT: ret void +; +; MESA-LABEL: @kern_implicit_arg_num_bytes( +; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 { +; HSA-LABEL: @kern_lds_ptr( +; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_lds_ptr( +; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 +; MESA-NEXT: ret void +; + store i32 0, i32 addrspace(3)* %lds, align 4 + ret void +} + +define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 { +; HSA-LABEL: @kern_lds_ptr_si( +; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* +; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_lds_ptr_si( +; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* +; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 +; MESA-NEXT: ret void +; + store i32 0, i32 addrspace(3)* %lds, align 4 + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8(i8 
%arg0, i8 %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 { +; HSA-LABEL: @kern_realign_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, 
!invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 { +; HSA-LABEL: @kern_realign_i8_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* 
[[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; 
MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i8 %arg3, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_v3i8( +; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i24 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP7]] to <3 x i8> +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_v3i8( +; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 
addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i24 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP8]] to <3 x i8> +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_i16( +; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i16( +; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i16 %arg1, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_i1( +; HSA-NEXT: 
[[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 { +; HSA-LABEL: @kern_realign_i1_i1_i1( +; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: 
[[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i1 %arg2, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 { +; HSA-LABEL: @kern_realign_i1_i1_i1_i1( +; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] 
addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP16]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* 
[[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP17]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i1 %arg2, i1 addrspace(1)* undef + store volatile i1 %arg3, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_v3i1( +; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP7]] to <3 x i1> +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_v3i1( +; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 
addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i3 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP8]] to <3 x i1> +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_i16( +; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i16( +; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i16 %arg1, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 { +; HSA-LABEL: 
@kern_realign_i8_i8_i8_i8_i8_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 +; HSA-NEXT: [[TMP17:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP17]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 8 +; HSA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8 +; HSA-NEXT: [[TMP21:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP21]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP22:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 16 +; HSA-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i8 +; HSA-NEXT: [[TMP25:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP25]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP26:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP26]], 24 +; HSA-NEXT: [[TMP28:%.*]] = 
trunc i32 [[TMP27]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP20]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP24]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP28]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +; MESA-NEXT: [[TMP18:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP18]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 8 +; MESA-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 +; MESA-NEXT: [[TMP22:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP22]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP23:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP24:%.*]] = lshr i32 [[TMP23]], 16 +; MESA-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i8 +; MESA-NEXT: [[TMP26:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; 
MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP26]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP27:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP28:%.*]] = lshr i32 [[TMP27]], 24 +; MESA-NEXT: [[TMP29:%.*]] = trunc i32 [[TMP28]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP21]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP25]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP29]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i8 %arg3, i8 addrspace(1)* undef + store volatile i8 %arg5, i8 addrspace(1)* undef + store volatile i8 %arg6, i8 addrspace(1)* undef + store volatile i8 %arg7, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 { +; HSA-LABEL: @kern_realign_f16_f16( +; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP4]] to half +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP8]] to half +; HSA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef +; HSA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_f16_f16( +; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: 
[[ARG0_LOAD:%.*]] = bitcast i16 [[TMP5]] to half +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP9]] to half +; MESA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef +; MESA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile half %arg0, half addrspace(1)* undef + store volatile half %arg1, half addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 { +; HSA-LABEL: @kern_global_ptr( +; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr( +; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 { +; HSA-LABEL: @kern_global_ptr_dereferencable( +; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr_dereferencable( +; MESA-NEXT: 
[[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !5 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 { +; HSA-LABEL: @kern_global_ptr_dereferencable_or_null( +; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr_dereferencable_or_null( +; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !6 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 { +; HSA-LABEL: @kern_nonnull_global_ptr( +; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], 
[[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_nonnull_global_ptr( +; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 { +; HSA-LABEL: @kern_align32_global_ptr( +; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !align !3 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_align32_global_ptr( +; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !7 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 { +; HSA-LABEL: @kern_noalias_global_ptr( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] 
addrspace(4)*
+; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT: ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr(
+; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)*
+; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT: ret void
+;
+ store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
+; HSA-LABEL: @kern_noalias_global_ptr_x2(
+; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
+; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT: ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr_x2(
+; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
+; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT: ret void
+;
+ store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
+ store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="kaveri" }
+attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+
+; HSA: !0 = !{}
+; HSA: !1 = !{i64 42}
+; HSA: !2 = !{i64 128}
+; HSA: !3 = !{i64 1024}
+
+
+; MESA: !0 = !{}
+; MESA: !1 = !{i32 0, i32 256}
+; MESA: !2 = !{i32 0, i32 65536}
+; MESA: !3 = !{i32 -128, i32 128}
+; MESA: !4 = !{i32 -32768, i32 32768}
+; MESA: !5 = !{i64 42}
+; MESA: !6 = !{i64 128}
+; MESA: !7 = !{i64 1024}
Index: test/CodeGen/AMDGPU/lshr.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,28 +8,14 @@
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-
-; VI: s_load_dword [[LHS:s[0-9]+]]
-; VI: s_load_dword [[RHS:s[0-9]+]]
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
-; VI-DAG: s_lshl_b32
-; VI: v_or_b32_e32
-
-; CI: s_load_dword s
-; CI-NEXT: s_load_dword s
-; CI-NOT: 
{{buffer|flat}} -; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CI: s_and_b32 -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; CI: s_and_b32 -; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 -; CI: s_lshl_b32 -; CI: v_or_b32_e32 +; CIVI: s_load_dword [[LHS:s[0-9]+]] +; CIVI: s_load_dword [[RHS:s[0-9]+]] +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI-DAG: s_lshl_b32 +; CIVI: v_or_b32_e32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -206,14 +206,14 @@ ; SIFoldOperands should not fold the SGPR copy into the instruction ; because the implicit immediate already uses the constant bus. ; GCN-LABEL: {{^}}madak_constant_bus_violation: -; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] ; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] -define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { +define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 br i1 %tmp, label %bb3, label %bb4 Index: test/CodeGen/AMDGPU/madmk.ll =================================================================== --- test/CodeGen/AMDGPU/madmk.ll +++ test/CodeGen/AMDGPU/madmk.ll @@ -83,7 +83,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/max.ll =================================================================== --- test/CodeGen/AMDGPU/max.ll +++ test/CodeGen/AMDGPU/max.ll @@ -216,14 +216,14 @@ ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_UINT -define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 
addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ugt i32 %a.ext, %b.ext @@ -236,14 +236,14 @@ ; Make sure redundant sign_extend_inreg removed. ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_INT -define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -262,7 +262,7 @@ ; SI: s_max_i32 ; EG: MAX_INT -define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 @@ -65,16 +65,14 @@ ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 -define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b store i8 %val, i8 addrspace(1)* %out ret void } -; XXX - should be able to use s_min 
if we stop unnecessarily doing -; extloads with mubuf instructions. - +; FIXME: Why vector and sdwa for last element? ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: ; GCN: s_load_dword s ; GCN: s_load_dword s @@ -88,7 +86,7 @@ ; VI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 -; VI: s_min_i32 +; VI: v_min_i32_sdwa ; GFX9: v_min_i16 ; GFX9: v_min_i16 @@ -99,7 +97,7 @@ ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 { %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, <4 x i8> addrspace(1)* %out @@ -110,9 +108,9 @@ ; GCN: s_load_dword s ; GCN: s_load_dword s -; SI: s_ashr_i32 ; SI: s_ashr_i32 ; SI: s_sext_i32_i16 +; SI: s_ashr_i32 ; SI: s_sext_i32_i16 ; SI: s_min_i32 ; SI: s_min_i32 @@ -346,8 +344,8 @@ } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI: {{buffer|flat|global}}_load_ubyte +; SI: {{buffer|flat|global}}_load_ubyte ; SI: v_min_u32_e32 ; GFX89: {{flat|global}}_load_ubyte @@ -490,14 +488,14 @@ ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -510,14 +508,17 @@ ; Make sure redundant sign_extend_inreg removed. 
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} -; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]] +; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]] + +; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext Index: test/CodeGen/AMDGPU/missing-store.ll =================================================================== --- test/CodeGen/AMDGPU/missing-store.ll +++ test/CodeGen/AMDGPU/missing-store.ll @@ -6,7 +6,7 @@ ; resulting in losing the store to gptr ; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 ; SI: ds_read_b64 ; SI-DAG: buffer_store_dword ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -19,7 +19,7 @@ ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, -define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: %tmp = icmp sgt i32 %arg3, 0 br i1 %tmp, label %bb4, label %bb17 Index: test/CodeGen/AMDGPU/mul.i16.ll =================================================================== --- test/CodeGen/AMDGPU/mul.i16.ll +++ test/CodeGen/AMDGPU/mul.i16.ll @@ -16,7 +16,8 @@ ; FIXME: Should emit scalar mul or maybe i16 v_mul here ; GCN-LABEL: {{^}}s_mul_i16: -; GCN: v_mul_u32_u24 +; SI: v_mul_u32_u24 +; VI: s_mul_i16 define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) { %r.val = mul i16 %a, %b store volatile i16 %r.val, i16 addrspace(1)* null Index: test/CodeGen/AMDGPU/mul.ll =================================================================== --- test/CodeGen/AMDGPU/mul.ll +++ test/CodeGen/AMDGPU/mul.ll @@ -114,7 +114,7 @@ ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, align 4 ret void @@ -201,10 +201,8 @@ ; FIXME: Load dwordx4 ; FUNC-LABEL: {{^}}s_mul_i128: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 @@ -220,18 +218,23 
@@ ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 -; VI: s_mul_i32 ; VI: v_mul_hi_u32 ; VI: s_mul_i32 +; VI: s_mul_i32 +; VI: v_mul_hi_u32 ; VI: v_mul_hi_u32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 +; VI: s_mul_i32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 - +; VI: s_mul_i32 ; GCN: buffer_store_dwordx4 -define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { +define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/mul_int24.ll =================================================================== --- test/CodeGen/AMDGPU/mul_int24.ll +++ test/CodeGen/AMDGPU/mul_int24.ll @@ -70,7 +70,7 @@ ; GCN-DAG: v_mul_i32_i24_e32 ; GCN: buffer_store_dwordx2 -define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 Index: test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -18,8 +18,11 @@ } ; FUNC-LABEL: {{^}}test_umul24_i16_sext: -; GCN: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 +; SI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 + +; VI: s_mul_i32 [[MUL:s[0-9]+]] +; VI: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]] define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b @@ -46,9 +49,12 @@ } ; FUNC-LABEL: {{^}}test_umul24_i16: -; GCN: s_and_b32 -; GCN: v_mul_u32_u24_e32 -; GCN: v_and_b32_e32 +; SI: s_and_b32 +; SI: v_mul_u32_u24_e32 +; SI: v_and_b32_e32 + +; VI: s_mul_i32 +; VI: s_and_b32 define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b @@ -147,7 +153,7 @@ ; GCN-NOT: s_and_b32 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] -define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) { entry: %tmp0 = shl i64 %a, 40 %a.24 = lshr i64 %tmp0, 40 Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll =================================================================== --- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -70,14 +70,14 @@ ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: ; %Flow1 +; GCN: ; %Flow4 ; GCN-NEXT: s_or_b64 exec, exec ; GCN: v_cmp_ne_u32_e32 vcc, 0 ; GCN: ; %exit1 ; GCN: ds_write_b32 -; GCN: %Flow2 +; GCN: %Flow5 ; GCN-NEXT: s_or_b64 exec, exec ; GCN: v_cmp_ne_u32_e32 vcc, 0 ; GCN-NEXT: s_and_saveexec_b64 Index: test/CodeGen/AMDGPU/no-shrink-extloads.ll =================================================================== --- test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -78,7 +78,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define amdgpu_kernel void 
@truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %trunc = trunc i64 %arg to i32 store i32 %trunc, i32 addrspace(1)* %out ret void @@ -100,7 +100,7 @@ ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -147,7 +147,7 @@ ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i8 store i8 %trunc, i8 addrspace(1)* %out @@ -171,7 +171,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %trunc = trunc i64 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll =================================================================== --- test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll +++ test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll @@ -6,7 +6,7 @@ ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) { +define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) { bb: %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4 %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5 Index: test/CodeGen/AMDGPU/operand-spacing.ll =================================================================== --- test/CodeGen/AMDGPU/operand-spacing.ll +++ test/CodeGen/AMDGPU/operand-spacing.ll @@ -4,14 +4,18 @@ ; Make sure there isn't an extra space between the instruction name and first operands. 
; GCN-LABEL: {{^}}add_f32: -; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] +; SI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]] + +; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] +; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] + ; GCN: buffer_store_dword [[RESULT]], -define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) { %result = fadd float %a, %b store float %result, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/or.ll =================================================================== --- test/CodeGen/AMDGPU/or.ll +++ test/CodeGen/AMDGPU/or.ll @@ -63,26 +63,26 @@ } ; FUNC-LABEL: {{^}}scalar_or_literal_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -92,7 +92,7 @@ } ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: -; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: or_b32 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 ; SI-NOT: or_b32 @@ -101,7 +101,7 @@ ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 
addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -125,7 +125,7 @@ ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -239,7 +239,7 @@ ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -249,7 +249,7 @@ ; FUNC-LABEL: {{^}}or_i1: ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1)* %in0 %b = load float, float addrspace(1)* %in1 Index: test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll =================================================================== --- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -10,26 +10,26 @@ ; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs: -; GCN: def s[8:15] -; GCN: def s[16:23] -; GCN: def s[24:31] -; GCN: def s[32:39] -; GCN: def s[40:47] -; GCN: def s[48:55] -; GCN: def s[56:63] -; GCN: def s[64:71] -; GCN: def s[72:79] -; GCN: def s[80:87] -; GCN: def s[88:95] - -; GCN: v_writelane_b32 v0, s8, 0 -; GCN-NEXT: v_writelane_b32 v0, s9, 1 -; GCN-NEXT: v_writelane_b32 v0, s10, 2 -; GCN-NEXT: v_writelane_b32 v0, s11, 3 -; GCN-NEXT: v_writelane_b32 v0, s12, 4 -; GCN-NEXT: v_writelane_b32 v0, s13, 5 -; GCN-NEXT: v_writelane_b32 v0, s14, 6 -; GCN-NEXT: v_writelane_b32 v0, s15, 7 +; GCN: def s[4:11] +; GCN: def s[12:19] +; GCN: def s[20:27] +; GCN: def s[28:35] +; GCN: def s[36:43] +; GCN: def s[44:51] +; GCN: def s[52:59] +; GCN: def s[60:67] +; GCN: def s[68:75] +; GCN: def s[76:83] +; GCN: def s[84:91] + +; GCN: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}} ; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8 @@ -37,8 +37,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12 -; GCN-NEXT: v_writelane_b32 v0, s13, 13 -; GCN-NEXT: v_writelane_b32 v0, s14, 14 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -47,8 +47,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18 ; GCN-NEXT: v_writelane_b32 v0, 
s{{[0-9]+}}, 19 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20 -; GCN-NEXT: v_writelane_b32 v0, s13, 21 -; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -57,8 +57,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28 -; GCN-NEXT: v_writelane_b32 v0, s13, 29 -; GCN-NEXT: v_writelane_b32 v0, s14, 30 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -67,8 +67,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36 -; GCN-NEXT: v_writelane_b32 v0, s13, 37 -; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -77,8 +77,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44 -; GCN-NEXT: v_writelane_b32 v0, s13, 45 -; GCN-NEXT: v_writelane_b32 v0, s14, 46 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -87,90 +87,90 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52 -; GCN-NEXT: v_writelane_b32 v0, s13, 53 -; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55 -; GCN-NEXT: v_writelane_b32 v0, s88, 56 -; GCN-NEXT: v_writelane_b32 v0, s89, 57 -; GCN-NEXT: v_writelane_b32 v0, s90, 58 -; GCN-NEXT: v_writelane_b32 v0, s91, 59 -; GCN-NEXT: v_writelane_b32 v0, s92, 60 -; GCN-NEXT: v_writelane_b32 v0, s93, 61 -; GCN-NEXT: v_writelane_b32 v0, s94, 62 -; GCN-NEXT: v_writelane_b32 v0, s95, 63 -; GCN-NEXT: v_writelane_b32 v1, s16, 0 -; GCN-NEXT: v_writelane_b32 v1, s17, 1 -; GCN-NEXT: v_writelane_b32 v1, s18, 2 -; GCN-NEXT: v_writelane_b32 v1, s19, 3 -; GCN-NEXT: v_writelane_b32 v1, s20, 4 -; GCN-NEXT: v_writelane_b32 v1, s21, 5 -; GCN-NEXT: v_writelane_b32 v1, s22, 6 -; GCN-NEXT: v_writelane_b32 v1, s23, 7 -; GCN-NEXT: v_writelane_b32 v1, s24, 8 -; GCN-NEXT: v_writelane_b32 v1, s25, 9 -; GCN-NEXT: v_writelane_b32 v1, s26, 10 -; GCN-NEXT: v_writelane_b32 v1, s27, 11 -; GCN-NEXT: v_writelane_b32 v1, s28, 12 -; GCN-NEXT: v_writelane_b32 v1, s29, 13 -; GCN-NEXT: v_writelane_b32 v1, s30, 14 -; GCN-NEXT: v_writelane_b32 v1, s31, 15 -; GCN-NEXT: v_writelane_b32 v1, s32, 16 -; GCN-NEXT: v_writelane_b32 v1, s33, 17 -; GCN-NEXT: v_writelane_b32 v1, s34, 18 -; GCN-NEXT: v_writelane_b32 v1, s35, 19 -; GCN-NEXT: v_writelane_b32 v1, s36, 20 -; GCN-NEXT: v_writelane_b32 v1, s37, 21 -; GCN-NEXT: v_writelane_b32 v1, s38, 22 -; GCN-NEXT: v_writelane_b32 v1, s39, 23 -; GCN-NEXT: v_writelane_b32 v1, s40, 24 -; GCN-NEXT: v_writelane_b32 v1, s41, 25 -; GCN-NEXT: v_writelane_b32 v1, s42, 26 -; GCN-NEXT: v_writelane_b32 v1, s43, 27 -; GCN-NEXT: v_writelane_b32 v1, s44, 28 -; GCN-NEXT: v_writelane_b32 v1, s45, 29 -; GCN-NEXT: v_writelane_b32 v1, s46, 30 -; GCN-NEXT: 
v_writelane_b32 v1, s47, 31 -; GCN-NEXT: v_writelane_b32 v1, s48, 32 -; GCN-NEXT: v_writelane_b32 v1, s49, 33 -; GCN-NEXT: v_writelane_b32 v1, s50, 34 -; GCN-NEXT: v_writelane_b32 v1, s51, 35 -; GCN-NEXT: v_writelane_b32 v1, s52, 36 -; GCN-NEXT: v_writelane_b32 v1, s53, 37 -; GCN-NEXT: v_writelane_b32 v1, s54, 38 -; GCN-NEXT: v_writelane_b32 v1, s55, 39 -; GCN-NEXT: v_writelane_b32 v1, s56, 40 -; GCN-NEXT: v_writelane_b32 v1, s57, 41 -; GCN-NEXT: v_writelane_b32 v1, s58, 42 -; GCN-NEXT: v_writelane_b32 v1, s59, 43 -; GCN-NEXT: v_writelane_b32 v1, s60, 44 -; GCN-NEXT: v_writelane_b32 v1, s61, 45 -; GCN-NEXT: v_writelane_b32 v1, s62, 46 -; GCN-NEXT: v_writelane_b32 v1, s63, 47 -; GCN-NEXT: v_writelane_b32 v1, s64, 48 -; GCN-NEXT: v_writelane_b32 v1, s65, 49 -; GCN-NEXT: v_writelane_b32 v1, s66, 50 -; GCN-NEXT: v_writelane_b32 v1, s67, 51 -; GCN-NEXT: v_writelane_b32 v1, s68, 52 -; GCN-NEXT: v_writelane_b32 v1, s69, 53 -; GCN-NEXT: v_writelane_b32 v1, s70, 54 -; GCN-NEXT: v_writelane_b32 v1, s71, 55 -; GCN-NEXT: v_writelane_b32 v1, s72, 56 -; GCN-NEXT: v_writelane_b32 v1, s73, 57 -; GCN-NEXT: v_writelane_b32 v1, s74, 58 -; GCN-NEXT: v_writelane_b32 v1, s75, 59 -; GCN-NEXT: v_writelane_b32 v1, s76, 60 -; GCN-NEXT: v_writelane_b32 v1, s77, 61 -; GCN-NEXT: v_writelane_b32 v1, s78, 62 -; GCN-NEXT: v_writelane_b32 v1, s79, 63 -; GCN-NEXT: v_writelane_b32 v2, s80, 0 -; GCN-NEXT: v_writelane_b32 v2, s81, 1 -; GCN-NEXT: v_writelane_b32 v2, s82, 2 -; GCN-NEXT: v_writelane_b32 v2, s83, 3 -; GCN-NEXT: v_writelane_b32 v2, s84, 4 -; GCN-NEXT: v_writelane_b32 v2, s85, 5 -; GCN-NEXT: v_writelane_b32 v2, s86, 6 -; GCN-NEXT: v_writelane_b32 v2, s87, 7 +; GCN-NEXT: v_writelane_b32 v0, s84, 56 +; GCN-NEXT: v_writelane_b32 v0, s85, 57 +; GCN-NEXT: v_writelane_b32 v0, s86, 58 +; GCN-NEXT: v_writelane_b32 v0, s87, 59 +; GCN-NEXT: v_writelane_b32 v0, s88, 60 +; GCN-NEXT: v_writelane_b32 v0, s89, 61 +; GCN-NEXT: v_writelane_b32 v0, s90, 62 +; GCN-NEXT: v_writelane_b32 v0, s91, 63 +; GCN-NEXT: v_writelane_b32 v1, s12, 0 +; GCN-NEXT: v_writelane_b32 v1, s13, 1 +; GCN-NEXT: v_writelane_b32 v1, s14, 2 +; GCN-NEXT: v_writelane_b32 v1, s15, 3 +; GCN-NEXT: v_writelane_b32 v1, s16, 4 +; GCN-NEXT: v_writelane_b32 v1, s17, 5 +; GCN-NEXT: v_writelane_b32 v1, s18, 6 +; GCN-NEXT: v_writelane_b32 v1, s19, 7 +; GCN-NEXT: v_writelane_b32 v1, s20, 8 +; GCN-NEXT: v_writelane_b32 v1, s21, 9 +; GCN-NEXT: v_writelane_b32 v1, s22, 10 +; GCN-NEXT: v_writelane_b32 v1, s23, 11 +; GCN-NEXT: v_writelane_b32 v1, s24, 12 +; GCN-NEXT: v_writelane_b32 v1, s25, 13 +; GCN-NEXT: v_writelane_b32 v1, s26, 14 +; GCN-NEXT: v_writelane_b32 v1, s27, 15 +; GCN-NEXT: v_writelane_b32 v1, s28, 16 +; GCN-NEXT: v_writelane_b32 v1, s29, 17 +; GCN-NEXT: v_writelane_b32 v1, s30, 18 +; GCN-NEXT: v_writelane_b32 v1, s31, 19 +; GCN-NEXT: v_writelane_b32 v1, s32, 20 +; GCN-NEXT: v_writelane_b32 v1, s33, 21 +; GCN-NEXT: v_writelane_b32 v1, s34, 22 +; GCN-NEXT: v_writelane_b32 v1, s35, 23 +; GCN-NEXT: v_writelane_b32 v1, s36, 24 +; GCN-NEXT: v_writelane_b32 v1, s37, 25 +; GCN-NEXT: v_writelane_b32 v1, s38, 26 +; GCN-NEXT: v_writelane_b32 v1, s39, 27 +; GCN-NEXT: v_writelane_b32 v1, s40, 28 +; GCN-NEXT: v_writelane_b32 v1, s41, 29 +; GCN-NEXT: v_writelane_b32 v1, s42, 30 +; GCN-NEXT: v_writelane_b32 v1, s43, 31 +; GCN-NEXT: v_writelane_b32 v1, s44, 32 +; GCN-NEXT: v_writelane_b32 v1, s45, 33 +; GCN-NEXT: v_writelane_b32 v1, s46, 34 +; GCN-NEXT: v_writelane_b32 v1, s47, 35 +; GCN-NEXT: v_writelane_b32 v1, s48, 36 +; GCN-NEXT: v_writelane_b32 v1, s49, 37 +; GCN-NEXT: 
v_writelane_b32 v1, s50, 38 +; GCN-NEXT: v_writelane_b32 v1, s51, 39 +; GCN-NEXT: v_writelane_b32 v1, s52, 40 +; GCN-NEXT: v_writelane_b32 v1, s53, 41 +; GCN-NEXT: v_writelane_b32 v1, s54, 42 +; GCN-NEXT: v_writelane_b32 v1, s55, 43 +; GCN-NEXT: v_writelane_b32 v1, s56, 44 +; GCN-NEXT: v_writelane_b32 v1, s57, 45 +; GCN-NEXT: v_writelane_b32 v1, s58, 46 +; GCN-NEXT: v_writelane_b32 v1, s59, 47 +; GCN-NEXT: v_writelane_b32 v1, s60, 48 +; GCN-NEXT: v_writelane_b32 v1, s61, 49 +; GCN-NEXT: v_writelane_b32 v1, s62, 50 +; GCN-NEXT: v_writelane_b32 v1, s63, 51 +; GCN-NEXT: v_writelane_b32 v1, s64, 52 +; GCN-NEXT: v_writelane_b32 v1, s65, 53 +; GCN-NEXT: v_writelane_b32 v1, s66, 54 +; GCN-NEXT: v_writelane_b32 v1, s67, 55 +; GCN-NEXT: v_writelane_b32 v1, s68, 56 +; GCN-NEXT: v_writelane_b32 v1, s69, 57 +; GCN-NEXT: v_writelane_b32 v1, s70, 58 +; GCN-NEXT: v_writelane_b32 v1, s71, 59 +; GCN-NEXT: v_writelane_b32 v1, s72, 60 +; GCN-NEXT: v_writelane_b32 v1, s73, 61 +; GCN-NEXT: v_writelane_b32 v1, s74, 62 +; GCN-NEXT: v_writelane_b32 v1, s75, 63 +; GCN-NEXT: v_writelane_b32 v2, s76, 0 +; GCN-NEXT: v_writelane_b32 v2, s77, 1 +; GCN-NEXT: v_writelane_b32 v2, s78, 2 +; GCN-NEXT: v_writelane_b32 v2, s79, 3 +; GCN-NEXT: v_writelane_b32 v2, s80, 4 +; GCN-NEXT: v_writelane_b32 v2, s81, 5 +; GCN-NEXT: v_writelane_b32 v2, s82, 6 +; GCN-NEXT: v_writelane_b32 v2, s83, 7 ; GCN: s_cbranch_scc1 @@ -393,24 +393,25 @@ ; into the next available VGPR. ; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs: -; GCN: def s[24:39] - -; GCN: v_writelane_b32 v0, s24, 50 -; GCN-NEXT: v_writelane_b32 v0, s25, 51 -; GCN-NEXT: v_writelane_b32 v0, s26, 52 -; GCN-NEXT: v_writelane_b32 v0, s27, 53 -; GCN-NEXT: v_writelane_b32 v0, s28, 54 -; GCN-NEXT: v_writelane_b32 v0, s29, 55 -; GCN-NEXT: v_writelane_b32 v0, s30, 56 -; GCN-NEXT: v_writelane_b32 v0, s31, 57 -; GCN-NEXT: v_writelane_b32 v0, s32, 58 -; GCN-NEXT: v_writelane_b32 v0, s33, 59 -; GCN-NEXT: v_writelane_b32 v0, s34, 60 -; GCN-NEXT: v_writelane_b32 v0, s35, 61 -; GCN-NEXT: v_writelane_b32 v0, s36, 62 -; GCN-NEXT: v_writelane_b32 v0, s37, 63 -; GCN-NEXT: v_writelane_b32 v1, s38, 0 -; GCN-NEXT: v_writelane_b32 v1, s39, 1 +; GCN: def s[4:19] +; GCN: def s[20:35] + +; GCN: v_writelane_b32 v0, s4, 50 +; GCN-NEXT: v_writelane_b32 v0, s5, 51 +; GCN-NEXT: v_writelane_b32 v0, s6, 52 +; GCN-NEXT: v_writelane_b32 v0, s7, 53 +; GCN-NEXT: v_writelane_b32 v0, s8, 54 +; GCN-NEXT: v_writelane_b32 v0, s9, 55 +; GCN-NEXT: v_writelane_b32 v0, s10, 56 +; GCN-NEXT: v_writelane_b32 v0, s11, 57 +; GCN-NEXT: v_writelane_b32 v0, s12, 58 +; GCN-NEXT: v_writelane_b32 v0, s13, 59 +; GCN-NEXT: v_writelane_b32 v0, s14, 60 +; GCN-NEXT: v_writelane_b32 v0, s15, 61 +; GCN-NEXT: v_writelane_b32 v0, s16, 62 +; GCN-NEXT: v_writelane_b32 v0, s17, 63 +; GCN-NEXT: v_writelane_b32 v1, s18, 0 +; GCN-NEXT: v_writelane_b32 v1, s19, 1 ; GCN: v_readlane_b32 s4, v0, 50 ; GCN-NEXT: v_readlane_b32 s5, v0, 51 Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -40,8 +40,7 @@ ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4: ; GCN: s_load_dword s -; GCN-NEXT: s_load_dword s -; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dwordx2 s ; GCN-NOT: {{buffer|flat|global}} ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} Index: test/CodeGen/AMDGPU/sad.ll =================================================================== --- 
test/CodeGen/AMDGPU/sad.ll
+++ test/CodeGen/AMDGPU/sad.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}v_sad_u32_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -203,8 +203,11 @@
}
; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
+ %a = load volatile i16, i16 addrspace(1)* undef
+ %b = load volatile i16, i16 addrspace(1)* undef
+ %c = load volatile i16, i16 addrspace(1)* undef
%icmp0 = icmp ugt i16 %a, %b
%sub0 = sub i16 %a, %b
%sub1 = sub i16 %b, %a
@@ -233,8 +236,31 @@
}
; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
+ %a = load volatile i8, i8 addrspace(1)* undef
+ %b = load volatile i8, i8 addrspace(1)* undef
+ %c = load volatile i8, i8 addrspace(1)* undef
+ %icmp0 = icmp ugt i8 %a, %b
+ %sub0 = sub i8 %a, %b
+ %sub1 = sub i8 %b, %a
+ %ret0 = select i1 %icmp0, i8 %sub0, i8 %sub1
+
+ %ret = add i8 %ret0, %c
+
+ store i8 %ret, i8 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
+; GCN: s_load_dword
+; GCN: s_bfe_u32
+; GCN: s_sub_i32
+; GCN: s_and_b32
+; GCN: s_sub_i32
+; GCN: s_lshr_b32
+; GCN: v_add_i32_e32
+define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a
Index: test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
===================================================================
--- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -2,14 +2,11 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
; FUNC-LABEL: {{^}}cluster_arg_loads:
-; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
-; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+
+; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 @@ -42,7 +39,7 @@ i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { entry: - %value = add i64 %arg125, %arg126 + %value = add i64 %arg124, %arg126 store i64 %value, i64 addrspace(1)* %out, align 8 ret void } Index: test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -1,10 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s -; SI: NumSgprs: {{[1-9]$}} -; SI: NumVgprs: {{[1-9]$}} +; SI-MINREG: NumSgprs: {{[1-9]$}} +; SI-MINREG: NumVgprs: {{[1-9]$}} + +; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}} +; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}} ; stores may alias loads ; VI: NumSgprs: {{[0-9]$}} Index: test/CodeGen/AMDGPU/select-i1.ll =================================================================== --- test/CodeGen/AMDGPU/select-i1.ll +++ test/CodeGen/AMDGPU/select-i1.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN @@ -18,12 +18,12 @@ ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 ; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] -; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] -; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] ; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] -define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { +define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext 
%cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/select-opt.ll =================================================================== --- test/CodeGen/AMDGPU/select-opt.ll +++ test/CodeGen/AMDGPU/select-opt.ll @@ -134,8 +134,9 @@ } ; GCN-LABEL: {{^}}regression: -; GCN: v_cmp_neq_f32_e64 -; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0 +; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0 +; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}} +; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}} ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { Index: test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- test/CodeGen/AMDGPU/select.f16.ll +++ test/CodeGen/AMDGPU/select.f16.ll @@ -24,10 +24,10 @@ half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -54,9 +54,9 @@ half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half 0xH3800, %b.val %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -84,9 +84,9 @@ half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, 0xH3800 %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -115,9 +115,9 @@ half addrspace(1)* %b, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half 0xH3800, half %d.val store half %r.val, half addrspace(1)* %r @@ -145,9 +145,9 @@ half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half %c.val, half 0xH3800 store half %r.val, half addrspace(1)* %r @@ -197,9 +197,9 @@ ; SI: 
v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 vcc, 0.5 -; SI: v_cndmask_b32_e32 ; SI: v_cmp_gt_f32_e32 +; SI: v_cndmask_b32_e32 + ; SI: v_cmp_lt_f32_e32 vcc, 0.5 ; SI: v_cndmask_b32_e32 ; VI: v_cmp_lt_f16_e32 @@ -233,10 +233,10 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_gt_f32_e32 vcc, 0.5 -; SI: v_cndmask_b32_e32 ; SI: v_cmp_lt_f32_e32 ; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 ; VI: v_cmp_gt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -272,7 +272,7 @@ ; SI: v_cmp_nlt_f32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cmp_nlt_f32_e32 -; SI: v_cndmask_b32_e32 +; SI-DAG: v_cndmask_b32_e32 ; VI: v_cmp_nlt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -280,8 +280,8 @@ ; VI: v_cmp_nlt_f16_e32 ; VI: v_cndmask_b32_e32 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm define amdgpu_kernel void @select_v2f16_imm_c( <2 x half> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/setcc-opt.ll =================================================================== --- test/CodeGen/AMDGPU/setcc-opt.ll +++ test/CodeGen/AMDGPU/setcc-opt.ll @@ -180,16 +180,14 @@ ret void } -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: v_cmp_ne_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { +; FUNC-LABEL: {{^}}v_cmp_sext_k_neg1_i8_sext_arg: +; GCN: v_cmp_ne_u32_e32 vcc, -1, v0 +; GCN-NEXT: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc +; GCN: buffer_store_byte [[SELECT]] +define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out + store i1 %icmp0, i1 addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -38,37 +38,37 @@ ; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]] ; SI: ; %bb.1: ; %else -; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe -; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf +; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 +; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e ; SI-NOT: add ; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; SI: [[IF]]: ; %if -; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI-NOT: add ; SI: [[ENDIF]]: ; %endif ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] ; SI: buffer_store_dword ; SI-NEXT: s_endpgm -define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else + %cmp0 = icmp eq i32 %a, 0 + br i1 %cmp0, label %if, label %else if: - %1 = add i32 %b, %c + %add0 = add i32 %b, %c br label %endif else: - %2 = add i32 %d, %e + %add1 = add i32 %d, %e br label %endif endif: - 
%3 = phi i32 [%1, %if], [%2, %else] - %4 = add i32 %3, %a - store i32 %4, i32 addrspace(1)* %out + %phi = phi i32 [%add0, %if], [%add1, %else] + %add2 = add i32 %phi, %a + store i32 %add2, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -275,11 +275,11 @@ ; Make sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_shl_32_i64: -; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %result = shl i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll +++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -14,10 +14,12 @@ ; VI: s_lshr_b32 ; VI: s_and_b32 ; VI: s_and_b32 +; VI: s_lshl_b32 +; VI: s_lshl_b32 +; VI: s_lshl_b32 ; VI: s_and_b32 ; VI: s_or_b32 - ; CI: s_load_dword s ; CI: s_load_dword s ; CI: s_lshr_b32 Index: test/CodeGen/AMDGPU/shl_add_constant.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_constant.ll +++ test/CodeGen/AMDGPU/shl_add_constant.ll @@ -54,14 +54,13 @@ } ; FUNC-LABEL: {{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], s[[Y]] ; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] -define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %shl, %y @@ -70,15 +69,14 @@ } ; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], s[[Y]], [[SHL3]] ; SI: s_addk_i32 [[TMP]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] -define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %y, %shl Index: test/CodeGen/AMDGPU/sign_extend.ll 
=================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -55,6 +55,7 @@ } ; GCN-LABEL: {{^}}s_sext_i16_to_i64: +; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 Index: test/CodeGen/AMDGPU/smed3.ll =================================================================== --- test/CodeGen/AMDGPU/smed3.ll +++ test/CodeGen/AMDGPU/smed3.ll @@ -370,7 +370,7 @@ ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 { bb: %tmp0 = call i16 @smin16(i16 %x, i16 %y) %tmp1 = call i16 @smax16(i16 %x, i16 %y) @@ -385,7 +385,7 @@ ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 { bb: %tmp0 = call i8 @smin8(i8 %x, i8 %y) %tmp1 = call i8 @smax8(i8 %x, i8 %y) Index: test/CodeGen/AMDGPU/sminmax.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.ll +++ test/CodeGen/AMDGPU/sminmax.ll @@ -193,7 +193,7 @@ ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] -define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { +define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 %val0, [8 x i32], i32 %val1) nounwind { %cond0 = icmp sgt i32 %val0, %val1 %sel0 = select i1 %cond0, i32 %val0, i32 %val1 %sel1 = select i1 %cond0, i32 %val1, i32 %val0 Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -8,34 +8,15 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; VI: s_sub_i32 -; VI: s_sub_i32 -; VI: s_max_i32 -; VI: s_max_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_and_b32 -; SI: s_or_b32 - -; CI-NOT: {{buffer|flat}}_load -; CI: s_load_dword s -; CI-NOT: {{buffer|flat}}_load -; CI: s_lshr_b32 -; CI: s_ashr_i32 -; CI: s_sext_i32_i16 -; CI: s_sub_i32 -; CI: s_sub_i32 -; CI: s_sext_i32_i16 -; CI: s_sext_i32_i16 -; CI: s_max_i32 -; CI: s_max_i32 -; CI: s_lshl_b32 -; CI: s_add_i32 -; CI: s_add_i32 -; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff -; CI: s_or_b32 - +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_sub_i32 +; CIVI: s_sub_i32 +; CIVI: s_max_i32 +; CIVI: s_max_i32 +; CIVI: s_add_i32 +; CIVI: s_add_i32 +; CIVI: s_and_b32 +; CIVI: s_or_b32 define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg @@ -61,6 +42,17 @@ ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 ; VI: v_or_b32_e32 + +; CI: buffer_load_dword v +; CI: v_lshrrev_b32_e32 +; CI: v_sub_i32_e32 +; CI: v_bfe_i32 +; CI: v_bfe_i32 +; CI: v_max_i32 +; CI: v_max_i32 +; CI: v_add_i32 +; CI: v_add_i32 +; CI: v_or_b32 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid @@ -115,10 +107,8 @@ ; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]] - ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]] - ; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -46,10 +46,10 @@ ; GCN-LABEL: {{^}}smrd3: ; FIXME: There are too many copies here because we don't fold immediates ; through REG_SEQUENCE -; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b +; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13 ; TODO: Add VI checks ; GCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(4)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 %tmp1 = load i32, i32 addrspace(4)* %tmp Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -231,11 +231,11 @@ } ; GCN-LABEL: {{^}}s_ashr_32_i64: -; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}} ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}} ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}} -define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %result = ashr i64 %a, 32 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -258,11 +258,11 @@ } ; GCN-LABEL: {{^}}s_ashr_63_i64: -; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}} ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} -define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %result = ashr i64 %a, 63 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/srl.ll =================================================================== --- test/CodeGen/AMDGPU/srl.ll +++ test/CodeGen/AMDGPU/srl.ll @@ -189,11 +189,11 @@ ; Make 
sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_lshr_32_i64: -; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}} +; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %result = lshr i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/store-weird-sizes.ll =================================================================== --- test/CodeGen/AMDGPU/store-weird-sizes.ll +++ test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -18,9 +18,9 @@ } ; GCN-LABEL: {{^}}local_store_i55: -; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6 -; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}} +; CIVI-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:6 +; CIVI-DAG: ds_write_b16 v{{[0-9]+}}, v{{[0-9]+}} offset:4 +; CIVI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+$}} ; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6 ; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -6,9 +6,9 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; FUNC-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dword [[A:s[0-9]+]] -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_sub_i32 s{{[0-9]+}}, [[A]], [[B]] +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]] define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = sub i32 %a, %b store i32 %result, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -4,20 +4,23 @@ target triple="amdgcn--" ; CHECK-LABEL: foobar: -; CHECK: s_load_dword s2, s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK: v_mbcnt_lo_u32_b32_e64 +; CHECK: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; BB0_1: -; CHECK: s_load_dword s0, s[0:1], 0xa ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; BB0_2: +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc + +; CHECK: BB0_1: +; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr2_sgpr3 killed $exec +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 + +; CHECK: BB0_2: ; CHECK: s_or_b64 exec, exec, s[2:3] -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; CHECK-NEXT: s_endpgm define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: Index: test/CodeGen/AMDGPU/trunc-store-i1.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store-i1.ll +++ 
test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -1,37 +1,38 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
-; SI: s_load_dword [[LOAD:s[0-9]+]],
-; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: buffer_store_byte [[VREG]],
+; GCN-LABEL: {{^}}global_truncstore_i32_to_i1:
+; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; GCN: buffer_store_byte [[VREG]],
define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
%trunc = trunc i32 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}
-; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
-; SI: buffer_store_byte
+; GCN-LABEL: {{^}}global_truncstore_i64_to_i1:
+; GCN: buffer_store_byte
define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
%trunc = trunc i64 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}
-; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
-; SI: s_load_dword [[LOAD:s[0-9]+]],
-; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: buffer_store_byte [[VREG]],
+; FIXME: VGPR on VI
+; GCN-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
+; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; GCN: buffer_store_byte [[VREG]],
define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
%trunc = trunc i16 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}
-; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; GCN-LABEL: {{^}}global_truncstore_i16_to_i1:
define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
%add = add i16 %val0, %val1
%trunc = trunc i16 %add to i1
Index: test/CodeGen/AMDGPU/trunc.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc.ll
+++ test/CodeGen/AMDGPU/trunc.ll
@@ -1,10 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-define amdgpu_kernel void
@trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] @@ -28,7 +28,7 @@ ; SI: buffer_store_dword [[VSHL]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] -define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32], i64 %a) { %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -94,12 +94,12 @@ } ; GCN-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/udivrem.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem.ll +++ test/CodeGen/AMDGPU/udivrem.ll @@ -51,7 +51,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-NOT: v_and_b32 ; SI: s_endpgm -define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { +define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) { %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 %result1 = urem i32 %x, %y Index: test/CodeGen/AMDGPU/umed3.ll =================================================================== --- test/CodeGen/AMDGPU/umed3.ll +++ test/CodeGen/AMDGPU/umed3.ll @@ -368,7 +368,7 @@ ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 { bb: %tmp0 = call i16 @umin16(i16 %x, i16 %y) %tmp1 = call i16 @umax16(i16 %x, i16 %y) @@ -383,7 +383,7 @@ ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 { bb: %tmp0 = call i8 @umin8(i8 %x, i8 %y) %tmp1 = call i8 @umax8(i8 %x, i8 %y) Index: test/CodeGen/AMDGPU/unaligned-load-store.ll =================================================================== --- test/CodeGen/AMDGPU/unaligned-load-store.ll +++ test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -442,7 +442,7 @@ ; ALIGNED: buffer_load_ushort ; ALIGNED: buffer_load_ushort -; UNALIGNED: s_load_dwordx2 +; UNALIGNED: 
s_load_dwordx4 ; UNALIGNED: buffer_store_dwordx2 define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(4)* %p, align 2 Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -195,7 +195,7 @@ ; GCN-LABEL: {{^}}uniform_if_else: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] ; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]] @@ -248,10 +248,10 @@ } ; GCN-LABEL: {{^}}icmp_users_different_blocks: -; GCN: s_load_dword [[COND:s[0-9]+]] -; GCN: s_cmp_lt_i32 [[COND]], 1 +; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}} +; GCN: s_cmp_lt_i32 s[[COND0]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] -; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, [[COND]], 0{{$}} +; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}} ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm @@ -432,8 +432,7 @@ ; GCN-LABEL: {{^}}uniform_if_scc_i64_eq: ; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 - -; SI: v_cmp_eq_u64_e64 +; SI-DAG: v_cmp_eq_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] @@ -465,7 +464,7 @@ ; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; SI: v_cmp_ne_u64_e64 +; SI-DAG: v_cmp_ne_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] @@ -494,8 +493,8 @@ } ; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt: -; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: v_cmp_gt_i64_e64 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 +; GCN-DAG: v_cmp_gt_i64_e64 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -28,12 +28,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 @@ -42,8 +40,7 @@ } ; GCN-LABEL: {{^}}test_use_s_v_s: -; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 
{{0xb|0x2c}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] @@ -53,11 +50,11 @@ ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] ; GCN-NOT: v_mov_b32 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SA]], [[VA0]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SA]], [[VA1]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { @@ -71,12 +68,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], s[[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 @@ -85,12 +80,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 @@ -152,11 +145,11 @@ } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2: -; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR0]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], [[SGPR1]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]] +; GCN-DAG: 
v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm @@ -180,11 +173,11 @@ } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2: -; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm @@ -208,11 +201,11 @@ } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2: -; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm @@ -225,14 +218,14 @@ } ; GCN-LABEL: {{^}}test_s0_s1_k_f32: -; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 -; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]] +; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]] ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] @@ -246,8 +239,8 @@ ; FIXME: Immediate in SGPRs just copied to VGPRs ; GCN-LABEL: {{^}}test_s0_s1_k_f64: -; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} +; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}} ; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000 ; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} @@ 
-261,7 +254,7 @@ ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] -define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) #0 { %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 store volatile double %fma0, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/v_cndmask.ll =================================================================== --- test/CodeGen/AMDGPU/v_cndmask.ll +++ test/CodeGen/AMDGPU/v_cndmask.ll @@ -42,12 +42,12 @@ ; (select (cmp (sgprX, constant)), constant, sgprZ) ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32: -; GCN: s_load_dword [[X:s[0-9]+]] -; GCN: s_load_dword [[Z:s[0-9]+]] -; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 -; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}} +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -73,12 +73,11 @@ } ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[Z:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} -; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 -; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -315,10 +314,10 @@ ; Different types compared vs. 
selected ; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: -; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dwordx2 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000 +; GCN-DAG: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -350,8 +350,8 @@ ; GCN-LABEL: {{^}}mac_v2f16_same_add: ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -390,8 +390,8 @@ ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -418,8 +418,8 @@ ; GCN-LABEL: {{^}}mac_v2f16_neg_b ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: v_mac_f16 @@ -452,8 +452,8 @@ ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]] -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]] +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} Index: test/CodeGen/AMDGPU/v_madak_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_madak_f16.ll +++ test/CodeGen/AMDGPU/v_madak_f16.ll @@ -35,9 +35,9 @@ half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %t0.val = fmul half %a.val, %b.val %t1.val = fmul half %a.val, %c.val Index: test/CodeGen/AMDGPU/xor.ll =================================================================== --- test/CodeGen/AMDGPU/xor.ll +++ test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < 
%s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}xor_v2i32: @@ -40,9 +40,9 @@ ; FUNC-LABEL: {{^}}xor_i1: ; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} -; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} -; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]] +; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}} +; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}} +; SI: s_xor_b64 [[XOR:vcc]], [[CMP1]], [[CMP0]] ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm @@ -173,26 +173,26 @@ } ; FUNC-LABEL: {{^}}scalar_xor_literal_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b -; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b +; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -202,25 +202,25 @@ } ; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: -; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: xor_b32 -; SI: s_xor_b32 
s[[VAL_LO]], s[[VAL_LO]], 63 +; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63 ; SI-NOT: xor_b32 -; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]] +; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}} ; SI-NOT: xor_b32 -; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] +; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}} ; SI-NOT: xor_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = xor i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI: s_xor_b64 [[VAL]], [[VAL]], -8 -define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8 +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = xor i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/zero_extend.ll =================================================================== --- test/CodeGen/AMDGPU/zero_extend.ll +++ test/CodeGen/AMDGPU/zero_extend.ll @@ -1,14 +1,14 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s ; R600: {{^}}s_mad_zext_i32_to_i64: ; R600: MEM_RAT_CACHELESS STORE_RAW ; R600: MEM_RAT_CACHELESS STORE_RAW -; SI: {{^}}s_mad_zext_i32_to_i64: -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} -; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} +; GCN: {{^}}s_mad_zext_i32_to_i64: +; GCN: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} +; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { entry: %tmp0 = mul i32 %a, %b @@ -18,8 +18,8 @@ ret void } -; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32 -; SI: v_cndmask_b32 +; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i32 +; GCN: v_cndmask_b32 define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %tmp0 = icmp eq i32 %a, %b @@ -28,17 +28,17 @@ ret void } -; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: +; GCN-LABEL: {{^}}s_arg_zext_i1_to_i64: define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { %ext = zext i1 %arg to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } -; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64: -; SI: s_mov_b32 s{{[0-9]+}}, 0 -; SI: v_cmp_eq_u32 -; SI: v_cndmask_b32 +; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64: +; GCN: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: v_cmp_eq_u32 +; GCN: v_cndmask_b32 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 @@ -46,10 +46,20 @@ ret void } -; SI-LABEL: 
{{^}}s_cmp_zext_i1_to_i16
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI: buffer_store_short [[RESULT]]
-define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+; FIXME: Why different commute?
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i16
+; GCN: s_load_dword [[A:s[0-9]+]]
+; GCN: s_load_dword [[B:s[0-9]+]]
+
+; SI: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
+; SI: v_cmp_eq_u32_e32 vcc, [[B]], [[V_A]]
+
+; VI: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; VI: v_cmp_eq_u32_e32 vcc, [[A]], [[V_B]]
+
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_short [[RESULT]]
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
%tmp0 = icmp eq i16 %a, %b
%tmp1 = zext i1 %tmp0 to i16
store i16 %tmp1, i16 addrspace(1)* %out