Diff 107684

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Show All 12 Lines
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"		#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"		#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"		#include "llvm/Analysis/DivergenceAnalysis.h"
		#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"		#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"		#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"		#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"		#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"		#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"		#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"		#include "llvm/Support/Casting.h"
		#include "AMDGPU.h"
		kzhuravl [Unsubmitted, Done]: Included twice.
#include <cassert>		#include <cassert>
#include <iterator>		#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"		#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;		using namespace llvm;

namespace {		namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,		class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {		public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const SISubtarget *ST = nullptr;		const SISubtarget *ST = nullptr;
DivergenceAnalysis *DA = nullptr;		DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;		Module *Mod = nullptr;
bool HasUnsafeFPMath = false;		bool HasUnsafeFPMath = false;
		AMDGPUAS AMDGPUASI;

/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to		/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.		/// binary operation \p V.
///		///
/// \returns Binary operation \p V.		/// \returns Binary operation \p V.
/// \returns \p T's base element bit width.		/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;		unsigned getBaseElementBitWidth(const Type *T) const;

▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	class AMDGPUCodeGenPrepare : public FunctionPass,
/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the		/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
/// shift amount is 32 minus \p I's base element bit width), and truncating		/// shift amount is 32 minus \p I's base element bit width), and truncating
/// the result of the shift operation back to \p I's original type.		/// the result of the shift operation back to \p I's original type.
///		///
/// \returns True.		/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;		bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:		public:
static char ID;		static char ID;

AMDGPUCodeGenPrepare() : FunctionPass(ID) {}		AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

bool visitFDiv(BinaryOperator &I);		bool visitFDiv(BinaryOperator &I);

		arsenm [Unsubmitted, Done]: The comment explains too specifically what it is doing; it should be describing intent and why.
bool visitInstruction(Instruction &I) { return false; }		bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);		bool visitBinaryOperator(BinaryOperator &I);
		bool visitLoadInst(LoadInst &I);
bool visitICmpInst(ICmpInst &I);		bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);		bool visitSelectInst(SelectInst &I);

bool visitIntrinsicInst(IntrinsicInst &I);		bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);		bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

bool doInitialization(Module &M) override;		bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	case Instruction::Mul:
return true;		return true;
case Instruction::Sub:		case Instruction::Sub:
return I.hasNoUnsignedWrap();		return I.hasNoUnsignedWrap();
default:		default:
return false;		return false;
}		}
}		}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {		bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
		arsenm [Unsubmitted, Done]: This doesn't actually do the widening, so the name should be something like canWidenScalarExtLoad.
assert(needsPromotionToI32(I.getType()) &&		assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");		"I does not need promotion to i32");

if (I.getOpcode() == Instruction::SDiv \|\|		if (I.getOpcode() == Instruction::SDiv \|\|
I.getOpcode() == Instruction::UDiv)		I.getOpcode() == Instruction::UDiv)
return false;		return false;

		arsenm [Unsubmitted, Done]: This should really be I.isSimple(), in case it's atomic.
IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());		Builder.SetCurrentDebugLocation(I.getDebugLoc());

Type *I32Ty = getI32Ty(Builder, I.getType());		Type *I32Ty = getI32Ty(Builder, I.getType());
Value *ExtOp0 = nullptr;		Value *ExtOp0 = nullptr;
Value *ExtOp1 = nullptr;		Value *ExtOp1 = nullptr;
Value *ExtRes = nullptr;		Value *ExtRes = nullptr;
Value *TruncRes = nullptr;		Value *TruncRes = nullptr;
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {

if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);

return Changed;		return Changed;
}		}

		bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
		Type * Ty = I.getType();
		arsenm [Unsubmitted, Done]: There's no point to having this since you never set it to anything else.
		arsenm [Unsubmitted, Done]: No space after `*`.
		VectorType *VT = dyn_cast<VectorType>(Ty);
		const DataLayout &DL = I.getModule()->getDataLayout();
		arsenm [Unsubmitted, Done]: M->getDataLayout()
		int TySize = DL.getTypeSizeInBits(Ty);
		arsenm [Unsubmitted, Done]: This should be able to handle vectors. This should also use the DataLayout so it works for pointers.
		wdng [Author, Unsubmitted, Done]: This one (VT && VT->getBitWidth() < 32) is able to handle vectors with bitwidth < 32 or scalar (!VT). Are you saying to use DataLayout to handle the pointer dereferenceable issue?

		if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
		arsenm [Unsubmitted, Done]: Ideally this would handle vectors like <2 x i8>.
		!I.isVolatile() && TySize < 32 && DA->isUniform(&I)) {
		arsenm [Unsubmitted, Done]: The address space check should be first.
		IRBuilder<> Builder(&I);
		arsenm [Unsubmitted, Done]: getI32Ty is the wrong thing to use here.
		arsenm [Unsubmitted, Not Done]: Move align check before DA. It would also be better to move the datalayout alignment check into a helper function checked here.
		Builder.SetCurrentDebugLocation(I.getDebugLoc());

		Type *I32Ty = Builder.getInt32Ty();
		arsenm [Unsubmitted, Done]: The builder already has a getInt32Ty.
		Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
		Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
		arsenm [Unsubmitted, Done]: I.getPointerOperand
		Value *WidenLoad = Builder.CreateLoad(BitCast);;
		if (VT) {
		Type *IntNTy = Builder.getIntNTy(TySize);
		Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
		Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
		I.replaceAllUsesWith(ValOrig);
		I.eraseFromParent();
		return true;
		arsenm [Unsubmitted, Done]: This should also skip volatile loads.
		}

		Value *TruncRes = Builder.CreateTrunc(WidenLoad, I.getType());
		I.replaceAllUsesWith(TruncRes);
		I.eraseFromParent();
		return true;
		arsenm [Unsubmitted, Done]: You don't need an entire block of code that is mostly the same for the vector case. The non-vector case requires a bitcast as well.
		}

		return false;
		}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {		bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
bool Changed = false;		bool Changed = false;

if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);

return Changed;		return Changed;
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-constant-i1.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=cypress < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=cypress < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	; FUNC-LABEL: {{^}}constant_load_i1:			; FUNC-LABEL: {{^}}constant_load_i1:
	; GCN: buffer_load_ubyte			; GCN: s_load_dword
	; GCN: v_and_b32_e32 v{{[0-9]+}}, 1			; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
	; GCN: buffer_store_byte			; GCN: buffer_store_byte

	; EG: VTX_READ_8			; EG: VTX_READ_8
	; EG: AND_INT			; EG: AND_INT
	define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
	%load = load i1, i1 addrspace(2)* %in			%load = load i1, i1 addrspace(2)* %in
	store i1 %load, i1 addrspace(1)* %out			store i1 %load, i1 addrspace(1)* %out
	ret void			ret void
	▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
	; FUNC-LABEL: {{^}}constant_load_v64i1:			; FUNC-LABEL: {{^}}constant_load_v64i1:
	define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
	%load = load <64 x i1>, <64 x i1> addrspace(2)* %in			%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
	store <64 x i1> %load, <64 x i1> addrspace(1)* %out			store <64 x i1> %load, <64 x i1> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:			; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
	; GCN: buffer_load_ubyte			; GCN: s_load_dword
	; GCN: buffer_store_dword			; GCN: buffer_store_dword
	define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
	%a = load i1, i1 addrspace(2)* %in			%a = load i1, i1 addrspace(2)* %in
	%ext = zext i1 %a to i32			%ext = zext i1 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i1_to_i32:			; FUNC-LABEL: {{^}}constant_sextload_i1_to_i32:
	; GCN: buffer_load_ubyte			; GCN: s_load_dword
	; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}			; GCN: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
	; GCN: buffer_store_dword			; GCN: buffer_store_dword

	; EG: VTX_READ_8			; EG: VTX_READ_8
	; EG: BFE_INT			; EG: BFE_INT
	define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
	%a = load i1, i1 addrspace(2)* %in			%a = load i1, i1 addrspace(2)* %in
	%ext = sext i1 %a to i32			%ext = sext i1 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
	%load = load <64 x i1>, <64 x i1> addrspace(2)* %in			%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
	%ext = sext <64 x i1> %load to <64 x i32>			%ext = sext <64 x i1> %load to <64 x i32>
	store <64 x i32> %ext, <64 x i32> addrspace(1)* %out			store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i1_to_i64:			; FUNC-LABEL: {{^}}constant_zextload_i1_to_i64:
	; GCN-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],			; GCN-DAG: s_load_dword [[LOAD:s[0-9]+]],
	; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}			; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, [[LOAD]]
	; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]			; GCN-DAG: s_and_b32 [[LOAD]], [[LOAD]], 1
	; GCN: buffer_store_dwordx2			; GCN: buffer_store_dwordx2
	define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
	%a = load i1, i1 addrspace(2)* %in			%a = load i1, i1 addrspace(2)* %in
	%ext = zext i1 %a to i64			%ext = zext i1 %a to i64
	store i64 %ext, i64 addrspace(1)* %out			store i64 %ext, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i1_to_i64:			; FUNC-LABEL: {{^}}constant_sextload_i1_to_i64:
	; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],			; GCN: s_load_dword [[LOAD:s[0-9]+]],
	; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}			; GCN: s_bfe_i64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x10000
	; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
	; GCN: buffer_store_dwordx2			; GCN: buffer_store_dwordx2
	define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {			define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
	%a = load i1, i1 addrspace(2)* %in			%a = load i1, i1 addrspace(2)* %in
	%ext = sext i1 %a to i64			%ext = sext i1 %a to i64
	store i64 %ext, i64 addrspace(1)* %out			store i64 %ext, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-constant-i16.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
	; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s			; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	; FUNC-LABEL: {{^}}constant_load_i16:			; FUNC-LABEL: {{^}}constant_load_i16:
	; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}			; GCN-NOHSA: s_load_dword s{{[0-9]+}}
	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {			define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
	entry:			entry:
	%ld = load i16, i16 addrspace(2)* %in			%ld = load i16, i16 addrspace(2)* %in
	store i16 %ld, i16 addrspace(1)* %out			store i16 %ld, i16 addrspace(1)* %out
	ret void			ret void
	}			}
	▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {			define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
	entry:			entry:
	%ld = load <16 x i16>, <16 x i16> addrspace(2)* %in			%ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
	store <16 x i16> %ld, <16 x i16> addrspace(1)* %out			store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:			; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
	; GCN-NOHSA: buffer_load_ushort			; GCN-NOHSA: s_load_dword
	; GCN-NOHSA: buffer_store_dword			; GCN-NOHSA: buffer_store_dword

	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword
	; GCN-HSA: flat_store_dword			; GCN-HSA: flat_store_dword

	; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1			; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
	define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = zext i16 %a to i32			%ext = zext i16 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:			; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
	; GCN-NOHSA: buffer_load_sshort			; GCN-NOHSA: s_load_dword
	; GCN-NOHSA: buffer_store_dword			; GCN-NOHSA: buffer_store_dword

	; GCN-HSA: flat_load_sshort			; GCN-HSA: s_load_dword
	; GCN-HSA: flat_store_dword			; GCN-HSA: flat_store_dword

	; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1			; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
	; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal			; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
	; EG: 16			; EG: 16
	define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = sext i16 %a to i32			%ext = sext i16 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:			; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
	; GCN-NOHSA: buffer_load_ushort			; GCN-NOHSA: s_load_dword
	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1			; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
	define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
	%load = load <1 x i16>, <1 x i16> addrspace(2)* %in			%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
	%ext = zext <1 x i16> %load to <1 x i32>			%ext = zext <1 x i16> %load to <1 x i32>
	store <1 x i32> %ext, <1 x i32> addrspace(1)* %out			store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:			; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
	; GCN-NOHSA: buffer_load_sshort			; GCN-NOHSA: s_load_dword
	; GCN-HSA: flat_load_sshort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1			; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
	; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal			; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
	; EG: 16			; EG: 16
	define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
	%load = load <1 x i16>, <1 x i16> addrspace(2)* %in			%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
	%ext = sext <1 x i16> %load to <1 x i32>			%ext = sext <1 x i16> %load to <1 x i32>
	store <1 x i32> %ext, <1 x i32> addrspace(1)* %out			store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
	▲ Show 20 Lines • Show All 294 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
	%load = load <64 x i16>, <64 x i16> addrspace(2)* %in			%load = load <64 x i16>, <64 x i16> addrspace(2)* %in
	%ext = sext <64 x i16> %load to <64 x i32>			%ext = sext <64 x i16> %load to <64 x i32>
	store <64 x i32> %ext, <64 x i32> addrspace(1)* %out			store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:			; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
	; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],			; GCN-NOHSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],			; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

	; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]			; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
	; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}			; GCN-HSA: flat_store_dwordx2 v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}, v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	; EG: MOV {{.*}}, 0.0			; EG: MOV {{.*}}, 0.0
	define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = zext i16 %a to i64			%ext = zext i16 %a to i64
	store i64 %ext, i64 addrspace(1)* %out			store i64 %ext, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:			; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
	; FIXME: Need to optimize this sequence to avoid extra bfe:			; FIXME: Need to optimize this sequence to avoid extra bfe:
	; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64			; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
	; t31: i64 = any_extend t28			; t31: i64 = any_extend t28
	; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16			; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

	; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],			; GCN-NOHSA-SI-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],			; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],			; GCN-NOHSA-VI-DAG: s_load_dword s[[ULO:[0-9]+]],
	; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16			; GCN-NOHSA-VI-DAG: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
	; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

	; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]			; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
	; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}			; GCN-HSA: flat_store_dwordx2 v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}, v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal			; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal
	; TODO: These could be expanded earlier using ASHR 15			; TODO: These could be expanded earlier using ASHR 15
	; EG: 31			; EG: 31
	define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = sext i16 %a to i64			%ext = sext i16 %a to i64
	▲ Show 20 Lines • Show All 155 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-constant-i8.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s		; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s		; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s


; FUNC-LABEL: {{^}}constant_load_i8:		; FUNC-LABEL: {{^}}constant_load_i8:
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}		; GCN-NOHSA: s_load_dword s{{[0-9]+}}
; GCN-HSA: flat_load_ubyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: NOT AND		; TODO: NOT AND
define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
entry:		entry:
%ld = load i8, i8 addrspace(2)* %in		%ld = load i8, i8 addrspace(2)* %in
store i8 %ld, i8 addrspace(1)* %out		store i8 %ld, i8 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_load_v2i8:		; FUNC-LABEL: {{^}}constant_load_v2i8:
; GCN-NOHSA: buffer_load_ushort v		; GCN-NOHSA: s_load_dword s
; GCN-HSA: flat_load_ushort v		; GCN-HSA: s_load_dword s

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in		%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out		store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
ret void		ret void
}		}
Show All 38 Lines
define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <16 x i8>, <16 x i8> addrspace(2)* %in		%ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
store <16 x i8> %ld, <16 x i8> addrspace(1)* %out		store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},		; GCN-NOHSA: s_load_dword s{{[0-9]+}},
; GCN-HSA: flat_load_ubyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i32		%ext = zext i8 %a to i32
store i32 %ext, i32 addrspace(1)* %out		store i32 %ext, i32 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:
; GCN-NOHSA: buffer_load_sbyte		; GCN-NOHSA: s_load_dword
; GCN-HSA: flat_load_sbyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal		; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8		; EG: 8
define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%ld = load i8, i8 addrspace(2)* %in		%ld = load i8, i8 addrspace(2)* %in
%ext = sext i8 %ld to i32		%ext = sext i8 %ld to i32
store i32 %ext, i32 addrspace(1)* %out		store i32 %ext, i32 addrspace(1)* %out
Show All 18 Lines
define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in		%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = sext <1 x i8> %load to <1 x i32>		%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out		store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:		; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:
; GCN-NOHSA: buffer_load_ushort		; GCN-NOHSA: s_load_dword
; GCN-HSA: flat_load_ushort		; GCN-HSA: s_load_dword

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: This should use DST, but for some there are redundant MOVs		; TODO: This should use DST, but for some there are redundant MOVs
; EG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG: 8		; EG: 8
define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in		%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = zext <2 x i8> %load to <2 x i32>		%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out		store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:		; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:
; GCN-NOHSA: buffer_load_ushort		; GCN-NOHSA: s_load_dwordx2

; GCN-HSA: flat_load_ushort		; GCN-HSA: s_load_dword

; GCN: v_bfe_i32		; GCN: s_sext_i32
; GCN: v_bfe_i32

; EG: VTX_READ_16 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in		%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = sext <2 x i8> %load to <2 x i32>		%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out		store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:		; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:
; GCN: s_load_dword s		; GCN: s_load_dword s

; GCN-DAG: s_bfe_u32		; GCN-DAG: s_and_b32
; GCN-DAG: s_bfe_u32		; GCN-DAG: s_and_b32
; GCN-DAG: s_and_b32		; GCN-DAG: s_and_b32

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in		%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
%ext = zext <3 x i8> %ld to <3 x i32>		%ext = zext <3 x i8> %ld to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out		store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:		; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:
; GCN: s_load_dword s		; GCN: s_load_dword s

; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32
; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32
; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32

; EG: VTX_READ_32 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_32 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
▲ Show 20 Lines • Show All 362 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
%ext = sext <64 x i8> %load to <64 x i32>		%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out		store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],		; GCN-NOHSA-DAG: s_load_dword s[[LO:[0-9]+]],
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]		; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]

; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],		; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]		; GCN-HSA: flat_store_dwordx2 v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}, v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0		; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i64		%ext = zext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out		store i64 %ext, i64 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:
; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[LO:[0-9]+]],
; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],		; GCN-HSA: s_load_dword s[[LO:[0-9]+]],
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]		; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x80000

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}		; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}		; GCN-HSA: flat_store_dwordx2 v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}, v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal		; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal
; TODO: Why not 7 ?		; TODO: Why not 7 ?
; EG: 31		; EG: 31
define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i64		%ext = sext i8 %a to i64
▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines
; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {		; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in		; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = sext <64 x i8> %load to <64 x i64>		; %ext = sext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out		; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
; ret void		; ret void
; }		; }

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:
; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-NOHSA: buffer_store_short v[[VAL]]		; GCN-NOHSA: buffer_store_short v[[VAL]]

; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],		; GCN-HSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]		; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i16		%ext = zext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out		store i16 %ext, i16 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:
; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],		; GCN-HSA: s_load_dword s[[VAL:[0-9]+]],

; GCN-NOHSA: buffer_store_short v[[VAL]]		; GCN-NOHSA: buffer_store_short v[[VAL]]
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]		; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i16		%ext = sext i8 %a to i16
▲ Show 20 Lines • Show All 205 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/unaligned-load-store.ll

Context not available.
	}	}

	; SI-LABEL: {{^}}constant_align4_load_i8:	; SI-LABEL: {{^}}constant_align4_load_i8:
	; SI: buffer_load_ubyte	; SI: s_load_dword
	; SI: buffer_store_byte	; SI: buffer_store_byte
	define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {	define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
	%v = load i8, i8 addrspace(2)* %p, align 4	%v = load i8, i8 addrspace(2)* %p, align 4
Context not available.
	}	}

	; SI-LABEL: {{^}}constant_align2_load_i8:	; SI-LABEL: {{^}}constant_align2_load_i8:
	; SI: buffer_load_ubyte	; SI: s_load_dword
	; SI: buffer_store_byte	; SI: buffer_store_byte
	define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {	define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
	%v = load i8, i8 addrspace(2)* %p, align 2	%v = load i8, i8 addrspace(2)* %p, align 2
Context not available.

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

This file was added.

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
				; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=FUNC %s
				arsenm: These are IR check lines, so they shouldn't use GCN. You also don't need or use the HSA check prefix.
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-NOHSA,FUNC %s
				arsenm: These should be running just this pass with opt.


				; FUNC-LABEL: {{^}}constant_load_i8:
				arsenm: Don't use FUNC; you don't have this as a check prefix.
				; GCN: s_load_dword s{{[0-9]+}}
				; GCN-NOHSA: s_load_dword s{{[0-9]+}}
				; GCN-HSA: s_load_dword s{{[0-9]+}}

				define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
				%ld = load i8, i8 addrspace(2)* %in
				arsenm: This should not be converted because you don't know if it's 4-byte aligned.
				store i8 %ld, i8 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}constant_load_v2i8:
				; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
				; GCN-NOHSA:s_load_dword s{{[0-9]+}}
				; GCN-HSA: s_load_dword s{{[0-9]+}}
				define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
				%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}constant_load_v3i8:
				; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
				arsenm: This isn't checking the relevant parts.
				; GCN-NOHSA:s_load_dword s{{[0-9]+}}
				; GCN-HSA: s_load_dword s{{[0-9]+}}
				define amdgpu_kernel void @constant_load_v3i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
				%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
				ret void
				}


				; FUNC-LABEL: {{^}}constant_load_i16:
				; GCN: s_load_dword s{{[0-9]+}}
				; GCN-NOHSA:s_load_dword s{{[0-9]+}}
				; GCN-HSA: s_load_dword s{{[0-9]+}}
				define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
				%ld = load i16, i16 addrspace(2)* %in
				%ext = sext i16 %ld to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				arsenm: Needs tests with half and <2 x half>, i1, and maybe another exotic size. I'm pretty sure this will assert for half as it is now. Also need tests with various alignments and volatile.
				arsenm: It would also be good to have a test specifically loading from the dispatch packet, as happens in the workitem ID calculation.
				attributes #0 = { nounwind }
				arsenm: _v2f16
				arsenm: _f16 is the naming convention.
				arsenm: Needs to check the type.
				arsenm: This should check the integer type/operands.
				arsenm: Spelling _addrespace1

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 107684

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/load-constant-i1.ll

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-constant-i8.ll

test/CodeGen/AMDGPU/unaligned-load-store.ll

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 107684

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/load-constant-i1.ll

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-constant-i8.ll

test/CodeGen/AMDGPU/unaligned-load-store.ll

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic