Diff 106894

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Show All 12 Lines
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"		#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"		#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"		#include "llvm/Analysis/DivergenceAnalysis.h"
		#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"		#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"		#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"		#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"		#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"		#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"		#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"		#include "llvm/Support/Casting.h"
		#include "AMDGPU.h"
		kzhuravl [Unsubmitted, Done]: Included twice.
#include <cassert>		#include <cassert>
#include <iterator>		#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"		#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;		using namespace llvm;

namespace {		namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,		class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {		public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const SISubtarget *ST = nullptr;		const SISubtarget *ST = nullptr;
DivergenceAnalysis *DA = nullptr;		DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;		Module *Mod = nullptr;
bool HasUnsafeFPMath = false;		bool HasUnsafeFPMath = false;
		AMDGPUAS AMDGPUASI;

/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to		/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.		/// binary operation \p V.
///		///
/// \returns Binary operation \p V.		/// \returns Binary operation \p V.
/// \returns \p T's base element bit width.		/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;		unsigned getBaseElementBitWidth(const Type *T) const;

▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	class AMDGPUCodeGenPrepare : public FunctionPass,
/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the		/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
/// shift amount is 32 minus \p I's base element bit width), and truncating		/// shift amount is 32 minus \p I's base element bit width), and truncating
/// the result of the shift operation back to \p I's original type.		/// the result of the shift operation back to \p I's original type.
///		///
/// \returns True.		/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;		bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:		public:
static char ID;		static char ID;

AMDGPUCodeGenPrepare() : FunctionPass(ID) {}		AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

bool visitFDiv(BinaryOperator &I);		bool visitFDiv(BinaryOperator &I);

		arsenm [Unsubmitted, Done]: The comment explains too specifically what it is doing; it should describe the intent and why.
bool visitInstruction(Instruction &I) { return false; }		bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);		bool visitBinaryOperator(BinaryOperator &I);
		bool visitLoadInst(LoadInst &I);
bool visitICmpInst(ICmpInst &I);		bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);		bool visitSelectInst(SelectInst &I);

bool visitIntrinsicInst(IntrinsicInst &I);		bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);		bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

bool doInitialization(Module &M) override;		bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	case Instruction::Mul:
return true;		return true;
case Instruction::Sub:		case Instruction::Sub:
return I.hasNoUnsignedWrap();		return I.hasNoUnsignedWrap();
default:		default:
return false;		return false;
}		}
}		}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {		bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
		arsenm [Unsubmitted, Done]: This doesn't actually do the widening, so the name should be something like canWidenScalarExtLoad.
assert(needsPromotionToI32(I.getType()) &&		assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");		"I does not need promotion to i32");

if (I.getOpcode() == Instruction::SDiv \|\|		if (I.getOpcode() == Instruction::SDiv \|\|
I.getOpcode() == Instruction::UDiv)		I.getOpcode() == Instruction::UDiv)
return false;		return false;

		arsenm [Unsubmitted, Done]: This should really be I.isSimple(), in case it's atomic.
IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());		Builder.SetCurrentDebugLocation(I.getDebugLoc());

Type *I32Ty = getI32Ty(Builder, I.getType());		Type *I32Ty = getI32Ty(Builder, I.getType());
Value *ExtOp0 = nullptr;		Value *ExtOp0 = nullptr;
Value *ExtOp1 = nullptr;		Value *ExtOp1 = nullptr;
Value *ExtRes = nullptr;		Value *ExtRes = nullptr;
Value *TruncRes = nullptr;		Value *TruncRes = nullptr;
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {

if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);

return Changed;		return Changed;
}		}

		bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
		Type * Ty = I.getType();
		arsenm [Unsubmitted, Done]: There's no point in having this, since you never set it to anything else.
		arsenm [Unsubmitted, Done]: No space after *.
		VectorType *VT = dyn_cast<VectorType>(Ty);
		if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
		arsenm [Unsubmitted, Done]: M->getDataLayout()
		!I.isVolatile() && (!VT \|\| (VT && VT->getBitWidth() < 32)) &&
		arsenm [Unsubmitted, Done]: This should be able to handle vectors. This should also use the DataLayout so it works for pointers.
		wdng [Author, Unsubmitted, Done]: This one, (VT && VT->getBitWidth() < 32), is able to handle vectors with bitwidth < 32 or scalars (!VT). Are you saying to use DataLayout to handle the pointer dereferenceable issue?
		needsPromotionToI32(I.getType()) && DA->isUniform(&I)) {
		IRBuilder<> Builder(&I);
		arsenm [Unsubmitted, Done]: Ideally this would handle vectors like <2 x i8>.
		Builder.SetCurrentDebugLocation(I.getDebugLoc());
		arsenm [Unsubmitted, Done]: The address space check should be first.
		Type *I32Ty = getI32Ty(Builder, I.getType());
		arsenm [Unsubmitted, Done]: getI32Ty is the wrong thing to use here.
		arsenm [Unsubmitted, Not Done]: Move the align check before DA. It would also be better to move the datalayout alignment check into a helper function checked here.
		Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
		Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
		Value *WidenLoad = Builder.CreateLoad(BitCast);
		arsenm [Unsubmitted, Done]: The builder already has a getInt32Ty.
		Value *TruncRes = Builder.CreateTrunc(WidenLoad, I.getType());

		arsenm [Unsubmitted, Done]: I.getPointerOperand
		I.replaceAllUsesWith(TruncRes);
		I.eraseFromParent();
		return true;
		}

		return false;
		}

		arsenm [Unsubmitted, Done]: This should also skip volatile loads.
bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {		bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
bool Changed = false;		bool Changed = false;

if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);
		arsenm [Unsubmitted, Done]: You don't need an entire block of code that is mostly the same for the vector case. The non-vector case requires a bitcast as well.

return Changed;		return Changed;
}		}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {		bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
bool Changed = false;		bool Changed = false;

if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-constant-i16.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
	; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s			; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	; FUNC-LABEL: {{^}}constant_load_i16:			; FUNC-LABEL: {{^}}constant_load_i16:
	; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}			; GCN-NOHSA: s_load_dword s{{[0-9]+}}
	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {			define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
	entry:			entry:
	%ld = load i16, i16 addrspace(2)* %in			%ld = load i16, i16 addrspace(2)* %in
	store i16 %ld, i16 addrspace(1)* %out			store i16 %ld, i16 addrspace(1)* %out
	ret void			ret void
	}			}
	▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {			define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
	entry:			entry:
	%ld = load <16 x i16>, <16 x i16> addrspace(2)* %in			%ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
	store <16 x i16> %ld, <16 x i16> addrspace(1)* %out			store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:			; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
	; GCN-NOHSA: buffer_load_ushort			; GCN-NOHSA: s_load_dword
	; GCN-NOHSA: buffer_store_dword			; GCN-NOHSA: buffer_store_dword

	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword
	; GCN-HSA: flat_store_dword			; GCN-HSA: flat_store_dword

	; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1			; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
	define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = zext i16 %a to i32			%ext = zext i16 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:			; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
	; GCN-NOHSA: buffer_load_sshort			; GCN-NOHSA: s_load_dword
	; GCN-NOHSA: buffer_store_dword			; GCN-NOHSA: buffer_store_dword

	; GCN-HSA: flat_load_sshort			; GCN-HSA: s_load_dword
	; GCN-HSA: flat_store_dword			; GCN-HSA: flat_store_dword

	; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1			; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
	; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal			; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
	; EG: 16			; EG: 16
	define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = sext i16 %a to i32			%ext = sext i16 %a to i32
	store i32 %ext, i32 addrspace(1)* %out			store i32 %ext, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:			; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
	; GCN-NOHSA: buffer_load_ushort			; GCN-NOHSA: s_load_dword
	; GCN-HSA: flat_load_ushort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1			; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
	define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
	%load = load <1 x i16>, <1 x i16> addrspace(2)* %in			%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
	%ext = zext <1 x i16> %load to <1 x i32>			%ext = zext <1 x i16> %load to <1 x i32>
	store <1 x i32> %ext, <1 x i32> addrspace(1)* %out			store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:			; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
	; GCN-NOHSA: buffer_load_sshort			; GCN-NOHSA: s_load_dword
	; GCN-HSA: flat_load_sshort			; GCN-HSA: s_load_dword

	; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1			; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
	; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal			; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
	; EG: 16			; EG: 16
	define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
	%load = load <1 x i16>, <1 x i16> addrspace(2)* %in			%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
	%ext = sext <1 x i16> %load to <1 x i32>			%ext = sext <1 x i16> %load to <1 x i32>
	store <1 x i32> %ext, <1 x i32> addrspace(1)* %out			store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
	▲ Show 20 Lines • Show All 294 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
	%load = load <64 x i16>, <64 x i16> addrspace(2)* %in			%load = load <64 x i16>, <64 x i16> addrspace(2)* %in
	%ext = sext <64 x i16> %load to <64 x i32>			%ext = sext <64 x i16> %load to <64 x i32>
	store <64 x i32> %ext, <64 x i32> addrspace(1)* %out			store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:			; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
	; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],			; GCN-NOHSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],			; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

	; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]			; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
	; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}			; GCN-HSA: flat_store_dwordx2 v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}, v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	; EG: MOV {{.*}}, 0.0			; EG: MOV {{.*}}, 0.0
	define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = zext i16 %a to i64			%ext = zext i16 %a to i64
	store i64 %ext, i64 addrspace(1)* %out			store i64 %ext, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:			; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
	; FIXME: Need to optimize this sequence to avoid extra bfe:			; FIXME: Need to optimize this sequence to avoid extra bfe:
	; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64			; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
	; t31: i64 = any_extend t28			; t31: i64 = any_extend t28
	; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16			; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

	; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],			; GCN-NOHSA-SI-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],			; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
	; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],			; GCN-NOHSA-VI-DAG: s_load_dword s[[ULO:[0-9]+]],
	; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16			; GCN-NOHSA-VI-DAG: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
	; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

	; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]			; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
	; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}			; GCN-HSA: flat_store_dwordx2 v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}, v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}

	; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1			; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
	; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal			; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal
	; TODO: These could be expanded earlier using ASHR 15			; TODO: These could be expanded earlier using ASHR 15
	; EG: 31			; EG: 31
	define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {			define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
	%a = load i16, i16 addrspace(2)* %in			%a = load i16, i16 addrspace(2)* %in
	%ext = sext i16 %a to i64			%ext = sext i16 %a to i64
	▲ Show 20 Lines • Show All 155 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-constant-i8.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s		; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s		; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s


; FUNC-LABEL: {{^}}constant_load_i8:		; FUNC-LABEL: {{^}}constant_load_i8:
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}		; GCN-NOHSA: s_load_dword s{{[0-9]+}}
; GCN-HSA: flat_load_ubyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: NOT AND		; TODO: NOT AND
define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
entry:		entry:
%ld = load i8, i8 addrspace(2)* %in		%ld = load i8, i8 addrspace(2)* %in
store i8 %ld, i8 addrspace(1)* %out		store i8 %ld, i8 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_load_v2i8:		; FUNC-LABEL: {{^}}constant_load_v2i8:
; GCN-NOHSA: buffer_load_ushort v		; GCN-NOHSA: s_load_dwordx2 s
; GCN-HSA: flat_load_ushort v		; GCN-HSA: s_load_dwordx2 s

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in		%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out		store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_load_v3i8:		; FUNC-LABEL: {{^}}constant_load_v3i8:
; GCN: s_load_dword s		; GCN: s_load_dwordx4 s

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in		%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
store <3 x i8> %ld, <3 x i8> addrspace(1)* %out		store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
ret void		ret void
}		}
Show All 27 Lines
define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <16 x i8>, <16 x i8> addrspace(2)* %in		%ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
store <16 x i8> %ld, <16 x i8> addrspace(1)* %out		store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},		; GCN-NOHSA: s_load_dword s{{[0-9]+}},
; GCN-HSA: flat_load_ubyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i32		%ext = zext i8 %a to i32
store i32 %ext, i32 addrspace(1)* %out		store i32 %ext, i32 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:
; GCN-NOHSA: buffer_load_sbyte		; GCN-NOHSA: s_load_dword
; GCN-HSA: flat_load_sbyte		; GCN-HSA: s_load_dword

; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal		; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8		; EG: 8
define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%ld = load i8, i8 addrspace(2)* %in		%ld = load i8, i8 addrspace(2)* %in
%ext = sext i8 %ld to i32		%ext = sext i8 %ld to i32
store i32 %ext, i32 addrspace(1)* %out		store i32 %ext, i32 addrspace(1)* %out
Show All 18 Lines
define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in		%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = sext <1 x i8> %load to <1 x i32>		%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out		store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:		; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:
; GCN-NOHSA: buffer_load_ushort		; GCN-NOHSA: s_load_dword
; GCN-HSA: flat_load_ushort		; GCN-HSA: s_load_dword

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: This should use DST, but for some there are redundant MOVs		; TODO: This should use DST, but for some there are redundant MOVs
; EG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG: 8		; EG: 8
define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in		%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = zext <2 x i8> %load to <2 x i32>		%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out		store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:		; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:
; GCN-NOHSA: buffer_load_ushort		; GCN-NOHSA: s_load_dwordx2

; GCN-HSA: flat_load_ushort		; GCN-HSA: s_load_dword

; GCN: v_bfe_i32		; GCN: s_sext_i32
; GCN: v_bfe_i32		; GCN: s_sext_i32

; EG: VTX_READ_16 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_16 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in		%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = sext <2 x i8> %load to <2 x i32>		%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out		store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:		; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:
; GCN: s_load_dword s		; GCN: s_load_dwordx4 s

; GCN-DAG: s_bfe_u32		; GCN-DAG: s_and_b32
; GCN-DAG: s_bfe_u32		; GCN-DAG: s_and_b32
; GCN-DAG: s_and_b32		; GCN-DAG: s_and_b32

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal		; EG-DAG: BFE_UINT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, {{.*}}literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:		entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in		%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
%ext = zext <3 x i8> %ld to <3 x i32>		%ext = zext <3 x i8> %ld to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out		store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:		; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:
; GCN: s_load_dword s		; GCN: s_load_dwordx4 s

; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32
; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32
; GCN-DAG: s_bfe_i32		; GCN-DAG: s_sext_i32

; EG: VTX_READ_32 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_32 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; TODO: These should use DST, but for some there are redundant MOVs		; TODO: These should use DST, but for some there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal		; EG-DAG: BFE_INT {{[* ]}}T{{[0-9].[XYZW]}}, {{.}}, 0.0, literal
; EG-DAG: 8		; EG-DAG: 8
; EG-DAG: 8		; EG-DAG: 8
▲ Show 20 Lines • Show All 362 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
%ext = sext <64 x i8> %load to <64 x i32>		%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out		store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],		; GCN-NOHSA-DAG: s_load_dword s[[LO:[0-9]+]],
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]		; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]

; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],		; GCN-HSA-DAG: s_load_dword s[[LO:[0-9]+]],
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]		; GCN-HSA: flat_store_dwordx2 v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}, v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0		; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i64		%ext = zext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out		store i64 %ext, i64 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:
; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[LO:[0-9]+]],
; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],		; GCN-HSA: s_load_dword s[[LO:[0-9]+]],
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]		; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x80000

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}		; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}		; GCN-HSA: flat_store_dwordx2 v{{\[}}{{[0-9]+}}:[[HI]]{{\]}}, v{{\[}}[[LO]]:{{[0-9]+}}{{\]}}

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal		; EG: ASHR {{\*}} {{T[0-9]\.[XYZW]}}, {{.}}, literal
; TODO: Why not 7 ?		; TODO: Why not 7 ?
; EG: 31		; EG: 31
define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i64		%ext = sext i8 %a to i64
▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines
; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {		; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in		; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = sext <64 x i8> %load to <64 x i64>		; %ext = sext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out		; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
; ret void		; ret void
; }		; }

; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:		; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:
; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-NOHSA: buffer_store_short v[[VAL]]		; GCN-NOHSA: buffer_store_short v[[VAL]]

; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],		; GCN-HSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]		; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i16		%ext = zext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out		store i16 %ext, i16 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:		; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:
; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],		; GCN-NOHSA: s_load_dword s[[VAL:[0-9]+]],
; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],		; GCN-HSA: s_load_dword s[[VAL:[0-9]+]],

; GCN-NOHSA: buffer_store_short v[[VAL]]		; GCN-NOHSA: buffer_store_short v[[VAL]]
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]		; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]

; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1		; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {		define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in		%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i16		%ext = sext i8 %a to i16
▲ Show 20 Lines • Show All 205 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/unaligned-load-store.ll

	Show First 20 Lines • Show All 513 Lines • ▼ Show 20 Lines
	; SI: buffer_store_dwordx4			; SI: buffer_store_dwordx4
	define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {			define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
	%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1			%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
	store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4			store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
	ret void			ret void
	}			}

	; SI-LABEL: {{^}}constant_align4_load_i8:			; SI-LABEL: {{^}}constant_align4_load_i8:
	; SI: buffer_load_ubyte			; SI: s_load_dword
	; SI: buffer_store_byte			; SI: buffer_store_byte
	define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {			define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
	%v = load i8, i8 addrspace(2)* %p, align 4			%v = load i8, i8 addrspace(2)* %p, align 4
	store i8 %v, i8 addrspace(1)* %r, align 4			store i8 %v, i8 addrspace(1)* %r, align 4
	ret void			ret void
	}			}

	; SI-LABEL: {{^}}constant_align2_load_i8:			; SI-LABEL: {{^}}constant_align2_load_i8:
	; SI: buffer_load_ubyte			; SI: s_load_dword
	; SI: buffer_store_byte			; SI: buffer_store_byte
	define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {			define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
	%v = load i8, i8 addrspace(2)* %p, align 2			%v = load i8, i8 addrspace(2)* %p, align 2
	store i8 %v, i8 addrspace(1)* %r, align 2			store i8 %v, i8 addrspace(1)* %r, align 2
	ret void			ret void
	}			}

	; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:			; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 106894

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-constant-i8.ll

test/CodeGen/AMDGPU/unaligned-load-store.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 106894

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-constant-i8.ll

test/CodeGen/AMDGPU/unaligned-load-store.ll

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic