Diff 108113

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Show All 12 Lines
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"		#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"		#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"		#include "llvm/Analysis/DivergenceAnalysis.h"
		#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"		#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"		#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"		#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"		#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"		#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"		#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"		#include "llvm/Support/Casting.h"
#include <cassert>		#include <cassert>
		kzhuravlUnsubmitted Done Reply Inline Actions Included twice. kzhuravl: Included twice.
#include <iterator>		#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"		#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;		using namespace llvm;

namespace {		namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,		class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {		public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const SISubtarget *ST = nullptr;		const SISubtarget *ST = nullptr;
DivergenceAnalysis *DA = nullptr;		DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;		Module *Mod = nullptr;
bool HasUnsafeFPMath = false;		bool HasUnsafeFPMath = false;
		AMDGPUAS AMDGPUASI;

/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to		/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.		/// binary operation \p V.
///		///
/// \returns Binary operation \p V.		/// \returns Binary operation \p V.
/// \returns \p T's base element bit width.		/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;		unsigned getBaseElementBitWidth(const Type *T) const;

▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	class AMDGPUCodeGenPrepare : public FunctionPass,
/// than or equal 16. Promotion is done by zero extending the operand to 32		/// than or equal 16. Promotion is done by zero extending the operand to 32
/// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the		/// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the		/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
/// shift amount is 32 minus \p I's base element bit width), and truncating		/// shift amount is 32 minus \p I's base element bit width), and truncating
/// the result of the shift operation back to \p I's original type.		/// the result of the shift operation back to \p I's original type.
///		///
/// \returns True.		/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;		bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
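		/// \brief Check whether a scalar load can be widened to 32 bits.
		///
		/// \details Uniform loads of types narrower than 32 bits from constant
		/// memory can be widened to a full 32-bit load whose result is then
		/// truncated, allowing a scalar load to be used instead of a vector load.
		///
		/// \returns True if \p I is non-volatile, smaller than 32 bits, at least
		/// 4-byte aligned, and uniform.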

		arsenmUnsubmitted Done Reply Inline Actions The comment explains too specifically what it is doing, it should be describing intent and why. arsenm: The comment explains too specifically what it is doing, it should be describing intent and why.
		bool widenScalarLoadtoI32(LoadInst &I) const;

public:		public:
static char ID;		static char ID;

AMDGPUCodeGenPrepare() : FunctionPass(ID) {}		AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

bool visitFDiv(BinaryOperator &I);		bool visitFDiv(BinaryOperator &I);

bool visitInstruction(Instruction &I) { return false; }		bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);		bool visitBinaryOperator(BinaryOperator &I);
		bool visitLoadInst(LoadInst &I);
bool visitICmpInst(ICmpInst &I);		bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);		bool visitSelectInst(SelectInst &I);

bool visitIntrinsicInst(IntrinsicInst &I);		bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);		bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

bool doInitialization(Module &M) override;		bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	case Instruction::Mul:
return true;		return true;
case Instruction::Sub:		case Instruction::Sub:
return I.hasNoUnsignedWrap();		return I.hasNoUnsignedWrap();
default:		default:
return false;		return false;
}		}
}		}

		bool AMDGPUCodeGenPrepare::widenScalarLoadtoI32(LoadInst &I) const {
		arsenmUnsubmitted Done Reply Inline Actions This doesn't actually do the widening, so the name should be something like canWidenScalarExtLoad arsenm: This doesn't actually do the widening, so the name should be something like…
		Type *Ty = I.getType();
		const DataLayout &DL = Mod->getDataLayout();
		int TySize = DL.getTypeSizeInBits(Ty);
		unsigned Align = I.getAlignment() ?
		I.getAlignment() : DL.getABITypeAlignment(Ty);

		return !I.isVolatile() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
		arsenmUnsubmitted Done Reply Inline Actions This should really be I.isSimple(), in case it's atomic. arsenm: This should really be I.isSimple(), in case it's atomic.
		}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {		bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
assert(needsPromotionToI32(I.getType()) &&		assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");		"I does not need promotion to i32");

if (I.getOpcode() == Instruction::SDiv \|\|		if (I.getOpcode() == Instruction::SDiv \|\|
I.getOpcode() == Instruction::UDiv)		I.getOpcode() == Instruction::UDiv)
return false;		return false;

▲ Show 20 Lines • Show All 202 Lines • ▼ Show 20 Lines	bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {

if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);

return Changed;		return Changed;
}		}

		bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
		if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
		arsenmUnsubmitted Done Reply Inline Actions There's no point to having this since you never set it to anything else arsenm: There's no point to having this since you never set it to anything else
		arsenmUnsubmitted Done Reply Inline Actions No space after * arsenm: No space after *
		widenScalarLoadtoI32(I)) {
		IRBuilder<> Builder(&I);
		arsenmUnsubmitted Done Reply Inline Actions M->getDataLayout() arsenm: M->getDataLayout()
		Builder.SetCurrentDebugLocation(I.getDebugLoc());
		arsenmUnsubmitted Done Reply Inline Actions This should be able to handle vectors. This should also use the DataLayout so it works for pointers arsenm: This should be able to handle vectors. This should also use the DataLayout so it works for…
		wdngAuthorUnsubmitted Done Reply Inline Actions This one (VT && VT->getBitWidth() < 32) is able to handle vectors with bitwidth < 32 or scalar (!VT). Are you saying to use DataLayout to handle the pointer dereferenceable issue? wdng: This one (VT && VT->getBitWidth() < 32) is able to handle vectors with bitwidth < 32 or scalar…

		Type *I32Ty = Builder.getInt32Ty();
		arsenmUnsubmitted Done Reply Inline Actions Ideally this would handle vectors like <2 x i8> arsenm: Ideally this would handle vectors like <2 x i8>
		Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
		arsenmUnsubmitted Done Reply Inline Actions The address space check should be first arsenm: The address space check should be first
		Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
		arsenmUnsubmitted Done Reply Inline Actions getI32Ty is the wrong thing to use here arsenm: getI32Ty is the wrong thing to use here
		arsenmUnsubmitted Not Done Reply Inline Actions Move align check before DA. It would also be better to move the datalayout alignment check into a helper function checked here arsenm: Move align check before DA. It would also be better to move the datalayout alignment check into…
		Value *WidenLoad = Builder.CreateLoad(BitCast);

		int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
		arsenmUnsubmitted Done Reply Inline Actions The builder already has a getInt32Ty arsenm: The builder already has a getInt32Ty
		Type *IntNTy = Builder.getIntNTy(TySize);
		Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
		arsenmUnsubmitted Done Reply Inline Actions I.getPointerOperand arsenm: I.getPointerOperand
		Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
		I.replaceAllUsesWith(ValOrig);
		I.eraseFromParent();
		return true;
		}

		return false;
		}
		arsenmUnsubmitted Done Reply Inline Actions This should also skip volatile loads arsenm: This should also skip volatile loads

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {		bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
bool Changed = false;		bool Changed = false;

if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
		arsenmUnsubmitted Done Reply Inline Actions You don't need an entire block of code that is mostly the same for the vector case. The non-vector case requires a bitcast as well. arsenm: You don't need an entire block of code that is mostly the same for the vector case. The non…
Changed \|= promoteUniformOpToI32(I);		Changed \|= promoteUniformOpToI32(I);

return Changed;		return Changed;
}		}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {		bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
bool Changed = false;		bool Changed = false;

▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/unaligned-load-store.ll

	Show First 20 Lines • Show All 513 Lines • ▼ Show 20 Lines
	; SI: buffer_store_dwordx4			; SI: buffer_store_dwordx4
	define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {			define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
	%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1			%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
	store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4			store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
	ret void			ret void
	}			}

	; SI-LABEL: {{^}}constant_align4_load_i8:			; SI-LABEL: {{^}}constant_align4_load_i8:
	; SI: buffer_load_ubyte			; SI: s_load_dword
	; SI: buffer_store_byte			; SI: buffer_store_byte
	define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {			define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
	%v = load i8, i8 addrspace(2)* %p, align 4			%v = load i8, i8 addrspace(2)* %p, align 4
	store i8 %v, i8 addrspace(1)* %r, align 4			store i8 %v, i8 addrspace(1)* %r, align 4
	ret void			ret void
	}			}

	; SI-LABEL: {{^}}constant_align2_load_i8:			; SI-LABEL: {{^}}constant_align2_load_i8:
	▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

This file was added.

				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck -check-prefix=OPT %s

				arsenmUnsubmitted Done Reply Inline Actions These are IR check lines, so shouldn't use GCN. You also don't need or use the HSA check prefix arsenm: These are IR check lines, so shouldn't use GCN. You also don't need or use the HSA check prefix
				declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0

				arsenmUnsubmitted Done Reply Inline Actions These should be running just this pass with opt arsenm: These should be running just this pass with opt
				; OPT-LABEL: @constant_load_i1
				; OPT: load i1
				arsenmUnsubmitted Done Reply Inline Actions Don't use FUNC, you don't have this as a check prefix arsenm: Don't use FUNC, you don't have this as a check prefix
				; OPT-NEXT: store i1
				define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
				%val = load i1, i1 addrspace(2)* %in
				store i1 %val, i1 addrspace(1)* %out
				ret void
				}

				arsenmUnsubmitted Done Reply Inline Actions This should not be converted because you don't know if it's 4 byte aligned arsenm: This should not be converted because you don't know if it's 4 byte aligned
				; OPT-LABEL: @constant_load_i1_align2
				; OPT: load i1
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
				%val = load i1, i1 addrspace(2)* %in, align 2
				store i1 %val, i1 addrspace(1)* %out, align 2
				ret void
				}

				; OPT-LABEL: @constant_load_i1_align4
				; OPT: bitcast
				; OPT-NEXT: load i32
				; OPT-NEXT: trunc
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
				arsenmUnsubmitted Done Reply Inline Actions This isn't checking the relevant parts arsenm: This isn't checking the relevant parts
				%val = load i1, i1 addrspace(2)* %in, align 4
				store i1 %val, i1 addrspace(1)* %out, align 4
				ret void
				}

				; OPT-LABEL: @constant_load_i8
				; OPT: load i8
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
				%val = load i8, i8 addrspace(2)* %in
				store i8 %val, i8 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_i8_align2
				; OPT: load i8
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
				%val = load i8, i8 addrspace(2)* %in, align 2
				store i8 %val, i8 addrspace(1)* %out, align 2
				ret void
				arsenmUnsubmitted Done Reply Inline Actions Needs tests with half and <2 x half>, i1, and maybe another exotic size. I'm pretty sure this will assert for half as is now. Also need tests with various alignments and volatile arsenm: Needs tests with half and <2 x half>, i1, and maybe another exotic size. I'm pretty sure this…
				arsenmUnsubmitted Done Reply Inline Actions Also would be good to have a test specifically loading from the dispatch packet like happens in the workitem ID calculation arsenm: Also would be good to have a test specifically loading from the dispatch packet like happens in…
				}

				; OPT-LABEL: @constant_load_i8align4
				; OPT: bitcast
				; OPT-NEXT: load i32
				; OPT-NEXT: trunc
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
				%val = load i8, i8 addrspace(2)* %in, align 4
				store i8 %val, i8 addrspace(1)* %out, align 4
				ret void
				}


				; OPT-LABEL: @constant_load_v2i8
				; OPT: load <2 x i8>
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
				%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_v2i8_align4
				; OPT: bitcast
				; OPT-NEXT: load i32
				; OPT-NEXT: trunc
				; OPT-NEXT: bitcast
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
				%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
				ret void
				}

				; OPT-LABEL: @constant_load_v3i8
				; OPT: bitcast <3 x i8>
				; OPT-NEXT: load i32, i32 addrspace(2)
				; OPT-NEXT: trunc i32
				; OPT-NEXT: bitcast i24
				; OPT-NEXT: store <3 x i8>
				define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
				arsenmUnsubmitted Done Reply Inline Actions This should check the integer type/operands arsenm: This should check the integer type/operands
				%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
				store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_v3i8_align4
				; OPT: bitcast <3 x i8>
				; OPT-NEXT: load i32, i32 addrspace(2)
				; OPT-NEXT: trunc i32
				; OPT-NEXT: bitcast i24
				; OPT-NEXT: store <3 x i8>
				define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
				%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
				store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
				ret void
				}

				; OPT-LABEL: @constant_load_i16
				; OPT: load i16
				; OPT: sext
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
				%ld = load i16, i16 addrspace(2)* %in
				%ext = sext i16 %ld to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_i16_align4
				; OPT: bitcast
				; OPT-NEXT: load i32
				; OPT-NEXT: trunc
				; OPT-NEXT: sext
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
				%ld = load i16, i16 addrspace(2)* %in, align 4
				%ext = sext i16 %ld to i32
				store i32 %ext, i32 addrspace(1)* %out, align 4
				ret void
				}

				; OPT-LABEL: @constant_load_f16
				; OPT: load half
				arsenmUnsubmitted Done Reply Inline Actions _f16 is the naming convention arsenm: _f16 is the naming convention
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
				%ld = load half, half addrspace(2)* %in
				store half %ld, half addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_v2f16
				; OPT: load <2 x half>
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
				%ld = load <2 x half>, <2 x half> addrspace(2)* %in
				arsenmUnsubmitted Done Reply Inline Actions _v2f16 arsenm: _v2f16
				store <2 x half> %ld, <2 x half> addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @load_volatile
				; OPT: load volatile i16
				; OPT-NEXT: store
				define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
				%a = load volatile i16, i16 addrspace(2)* %in
				store i16 %a, i16 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_v2i8_volatile
				; OPT: load volatile <2 x i8>
				; OPT-NEXT: store
				arsenmUnsubmitted Done Reply Inline Actions Needs to check the type arsenm: Needs to check the type
				define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
				%ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @constant_load_v2i8_addrspace1
				; OPT: load <2 x i8>
				; OPT-NEXT: store
				define amdgpu_kernel void @constant_load_v2i8_addrspace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
				%ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
				arsenmUnsubmitted Done Reply Inline Actions Spelling _addrespace1 arsenm: Spelling _addrespace1
				store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @use_dispatch_ptr
				; OPT: bitcast
				; OPT-NEXT: load i32
				; OPT-NEXT: trunc
				; OPT-NEXT: zext
				; OPT-NEXT: store
				define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
				%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
				%val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
				%ld = zext i8 %val to i32
				store i32 %ld, i32 addrspace(1)* %ptr
				ret void
				}

				attributes #0 = { nounwind }

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108113

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/unaligned-load-store.ll

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108113

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

test/CodeGen/AMDGPU/unaligned-load-store.ll

test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll

AMDGPU : Widen extending scalar loads to 32-bits
ClosedPublic