Diff 72884

llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Show All 33 Lines
class AMDGPUCodeGenPrepare : public FunctionPass,		class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {		public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNTargetMachine *TM;		const GCNTargetMachine *TM;
const SISubtarget *ST;		const SISubtarget *ST;
DivergenceAnalysis *DA;		DivergenceAnalysis *DA;
Module *Mod;		Module *Mod;
bool HasUnsafeFPMath;		bool HasUnsafeFPMath;

		/// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
		/// binary operator \p V.
		///
		/// \returns Binary operator \p V.
		Value *copyFlags(const BinaryOperator &I, Value *V) const;

		/// \returns Equivalent 16 bit integer type for given 32 bit integer type
		/// \p T.
		Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

		/// \returns Equivalent 32 bit integer type for given 16 bit integer type
		/// \p T.
		Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

		/// \returns True if the base element of type \p T is 16 bit integer, false
		/// otherwise.
		bool isI16Ty(const Type *T) const;

		/// \returns True if the base element of type \p T is 32 bit integer, false
		/// otherwise.
		bool isI32Ty(const Type *T) const;

		/// \returns True if binary operation \p I is a signed binary operation, false
		/// otherwise.
		bool isSigned(const BinaryOperator &I) const;

		/// \returns True if the condition of 'select' operation \p I comes from a
		/// signed 'icmp' operation, false otherwise.
		bool isSigned(const SelectInst &I) const;

		/// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit
		/// binary operation by sign or zero extending operands to 32 bits, replacing
		/// 16 bit operation with equivalent 32 bit operation, and truncating the
		/// result of 32 bit operation back to 16 bits. 16 bit division operation is
		/// not promoted.
		///
		/// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
		/// binary operation, false otherwise.
		bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;

		/// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
		/// operation by sign or zero extending operands to 32 bits, and replacing 16
		/// bit operation with 32 bit operation.
		///
		/// \returns True.
		bool promoteUniformI16OpToI32Op(ICmpInst &I) const;

		/// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
		/// operation by sign or zero extending operands to 32 bits, replacing 16 bit
		/// operation with 32 bit operation, and truncating the result of 32 bit
		/// operation back to 16 bits.
		///
		/// \returns True.
		bool promoteUniformI16OpToI32Op(SelectInst &I) const;

public:		public:
static char ID;		static char ID;
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :		AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
FunctionPass(ID),		FunctionPass(ID),
TM(static_cast<const GCNTargetMachine *>(TM)),		TM(static_cast<const GCNTargetMachine *>(TM)),
ST(nullptr),		ST(nullptr),
DA(nullptr),		DA(nullptr),
Mod(nullptr),		Mod(nullptr),
HasUnsafeFPMath(false) { }		HasUnsafeFPMath(false) { }

bool visitFDiv(BinaryOperator &I);		bool visitFDiv(BinaryOperator &I);

bool visitInstruction(Instruction &I) {		bool visitInstruction(Instruction &I) { return false; }
return false;		bool visitBinaryOperator(BinaryOperator &I);
}		bool visitICmpInst(ICmpInst &I);
		bool visitSelectInst(SelectInst &I);

bool doInitialization(Module &M) override;		bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;

const char *getPassName() const override {		const char *getPassName() const override {
return "AMDGPU IR optimizations";		return "AMDGPU IR optimizations";
}		}

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();		AU.addRequired<DivergenceAnalysis>();
AU.setPreservesAll();		AU.setPreservesAll();
}		}
};		};

} // End anonymous namespace		} // End anonymous namespace

		/// Propagates the nsw/nuw flags (for overflowing operators) or the exact flag
		/// (for possibly-exact operators) of \p I onto \p V.
		///
		/// \p V is normally the freshly built promoted binary operator. IRBuilder may
		/// constant-fold the promoted operation, in which case \p V is not a
		/// BinaryOperator and carries no flags; it is then returned untouched rather
		/// than tripping a cast assertion.
		///
		/// \returns \p V.
		Value *AMDGPUCodeGenPrepare::copyFlags(
		    const BinaryOperator &I, Value *V) const {
		  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
		  if (!BinOp)
		    return V;

		  if (isa<OverflowingBinaryOperator>(BinOp)) {
		    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
		    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
		  } else if (isa<PossiblyExactOperator>(BinOp)) {
		    BinOp->setIsExact(I.isExact());
		  }

		  return V;
		}

		/// Maps a 32 bit integer type (scalar or vector) to the 16 bit integer type
		/// of the same shape.
		///
		/// \param B Builder used to obtain the i16 type.
		/// \param T Type whose base element must be a 32 bit integer.
		/// \returns i16 for a scalar \p T, otherwise a vector of i16 with the same
		/// number of elements as \p T.
		Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
		  assert(isI32Ty(T) && "T must be 32 bits");

		  if (T->isIntegerTy())
		    return B.getInt16Ty();
		  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
		}

		/// Maps a 16 bit integer type (scalar or vector) to the 32 bit integer type
		/// of the same shape.
		///
		/// \param B Builder used to obtain the i32 type.
		/// \param T Type whose base element must be a 16 bit integer.
		/// \returns i32 for a scalar \p T, otherwise a vector of i32 with the same
		/// number of elements as \p T.
		Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
		  assert(isI16Ty(T) && "T must be 16 bits");

		  if (T->isIntegerTy())
		    return B.getInt32Ty();
		  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
		}

		bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
		if (T->isIntegerTy(16))
		return true;
		if (!T->isVectorTy())
		return false;
		return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
		}

		bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
		if (T->isIntegerTy(32))
		return true;
		if (!T->isVectorTy())
		return false;
		return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
		}

		/// \returns True if the operands of binary operation \p I must be sign
		/// extended (rather than zero extended) when the operation is widened.
		///
		/// Besides signed division and remainder this includes arithmetic shift
		/// right: ashr replicates the sign bit of its first operand, so widening that
		/// operand with zext would shift in zeros and change the truncated result
		/// (e.g. ashr i16 0x8000, 1 must give 0xC000, not 0x4000).
		bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
		  switch (I.getOpcode()) {
		  case Instruction::AShr:
		  case Instruction::SDiv:
		  case Instruction::SRem:
		    return true;
		  default:
		    return false;
		  }
		}

		bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
		return isa<ICmpInst>(I.getOperand(0)) ?
		cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
		}

		/// Widens 16 bit binary operation \p I to 32 bits: both operands are sign or
		/// zero extended (as chosen by isSigned), the operation is re-created on the
		/// extended operands, and the result is truncated back to 16 bits. All uses
		/// of \p I are redirected to the truncated value and \p I is erased.
		///
		/// 16 bit sdiv/udiv are left alone.
		///
		/// \returns True if \p I was replaced, false if it was left untouched.
		bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
		  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

		  const Instruction::BinaryOps Opc = I.getOpcode();
		  if (Opc == Instruction::SDiv || Opc == Instruction::UDiv)
		    return false;

		  IRBuilder<> Builder(&I);
		  Builder.SetCurrentDebugLocation(I.getDebugLoc());

		  Type *I32Ty = getI32Ty(Builder, I.getType());
		  Value *ExtOp0 = nullptr;
		  Value *ExtOp1 = nullptr;

		  if (isSigned(I)) {
		    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
		    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
		  } else {
		    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
		    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
		  }
		  Value *ExtRes = copyFlags(I, Builder.CreateBinOp(Opc, ExtOp0, ExtOp1));

		  // A 16 bit 'mul nsw' does not imply a 32 bit 'mul nsw' once the operands
		  // are zero extended: i16 -1 * -1 does not overflow, but 0xFFFF * 0xFFFF
		  // exceeds the signed i32 range and would turn a defined result into
		  // poison. The flag is only safe when nuw bounds the product below 2^16.
		  if (Opc == Instruction::Mul && !I.hasNoUnsignedWrap())
		    if (auto *PromotedMul = dyn_cast<BinaryOperator>(ExtRes))
		      PromotedMul->setHasNoSignedWrap(false);

		  Value *TruncRes =
		      Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

		  I.replaceAllUsesWith(TruncRes);
		  I.eraseFromParent();

		  return true;
		}

		/// Widens a 16 bit 'icmp' \p I to 32 bits. Both operands are sign extended
		/// for signed predicates and zero extended otherwise, a new 'icmp' with the
		/// same predicate is built on the extended operands, and \p I is replaced by
		/// it and erased.
		///
		/// \returns True.
		bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
		  Value *LHS = I.getOperand(0);
		  Value *RHS = I.getOperand(1);
		  assert(isI16Ty(LHS->getType()) && "Op0 must be 16 bits");
		  assert(isI16Ty(RHS->getType()) && "Op1 must be 16 bits");

		  IRBuilder<> Builder(&I);
		  Builder.SetCurrentDebugLocation(I.getDebugLoc());

		  Type *WideLHSTy = getI32Ty(Builder, LHS->getType());
		  Type *WideRHSTy = getI32Ty(Builder, RHS->getType());

		  // Extensions are emitted LHS first, then RHS, ahead of the compare.
		  const bool SignExtend = I.isSigned();
		  Value *WideLHS = SignExtend ? Builder.CreateSExt(LHS, WideLHSTy)
		                              : Builder.CreateZExt(LHS, WideLHSTy);
		  Value *WideRHS = SignExtend ? Builder.CreateSExt(RHS, WideRHSTy)
		                              : Builder.CreateZExt(RHS, WideRHSTy);
		  Value *WideCmp = Builder.CreateICmp(I.getPredicate(), WideLHS, WideRHS);

		  I.replaceAllUsesWith(WideCmp);
		  I.eraseFromParent();

		  return true;
		}

		/// Widens a 16 bit 'select' \p I to 32 bits. The true/false operands are sign
		/// or zero extended (as chosen by isSigned), a 32 bit 'select' on the
		/// original condition is built, its result is truncated back to 16 bits, and
		/// \p I is replaced by the truncated value and erased.
		///
		/// \returns True.
		bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
		  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

		  IRBuilder<> Builder(&I);
		  Builder.SetCurrentDebugLocation(I.getDebugLoc());

		  Type *WideTy = getI32Ty(Builder, I.getType());
		  Value *TrueVal = I.getOperand(1);
		  Value *FalseVal = I.getOperand(2);

		  // Extensions are emitted true-value first, then false-value.
		  const bool SignExtend = isSigned(I);
		  Value *WideTrue = SignExtend ? Builder.CreateSExt(TrueVal, WideTy)
		                               : Builder.CreateZExt(TrueVal, WideTy);
		  Value *WideFalse = SignExtend ? Builder.CreateSExt(FalseVal, WideTy)
		                                : Builder.CreateZExt(FalseVal, WideTy);

		  Value *WideSel = Builder.CreateSelect(I.getOperand(0), WideTrue, WideFalse);
		  Value *NarrowSel =
		      Builder.CreateTrunc(WideSel, getI16Ty(Builder, WideSel->getType()));

		  I.replaceAllUsesWith(NarrowSel);
		  I.eraseFromParent();

		  return true;
		}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {		static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);		const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)		if (!CNum)
return false;		return false;

// Reciprocal f32 is handled separately without denormals.		// Reciprocal f32 is handled separately without denormals.
return UnsafeDiv || CNum->isExactlyValue(+1.0);		return UnsafeDiv || CNum->isExactlyValue(+1.0);
}		}
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
return true;		return true;
}		}

static bool hasUnsafeFPMath(const Function &F) {		static bool hasUnsafeFPMath(const Function &F) {
Attribute Attr = F.getFnAttribute("unsafe-fp-math");		Attribute Attr = F.getFnAttribute("unsafe-fp-math");
return Attr.getValueAsString() == "true";		return Attr.getValueAsString() == "true";
}		}

		/// Visitor hook for binary operators: promotes \p I to 32 bits when the
		/// subtarget has 16 bit instructions, \p I produces a 16 bit integer (or
		/// vector thereof), and divergence analysis reports \p I as uniform.
		///
		/// \returns True if \p I was promoted, false otherwise.
		bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
		  // TODO: Should we promote smaller types that will be legalized to i16?
		  if (!ST->has16BitInsts() || !isI16Ty(I.getType()) || !DA->isUniform(&I))
		    return false;

		  return promoteUniformI16OpToI32Op(I);
		}

		/// Visitor hook for 'icmp': promotes \p I to a 32 bit compare when the
		/// subtarget has 16 bit instructions, both operands are 16 bit integers (or
		/// vectors thereof), and divergence analysis reports \p I as uniform.
		///
		/// \returns True if \p I was promoted, false otherwise.
		bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
		  // TODO: Should we promote smaller types that will be legalized to i16?
		  if (!ST->has16BitInsts())
		    return false;
		  if (!isI16Ty(I.getOperand(0)->getType()) ||
		      !isI16Ty(I.getOperand(1)->getType()))
		    return false;
		  if (!DA->isUniform(&I))
		    return false;

		  return promoteUniformI16OpToI32Op(I);
		}

		/// Visitor hook for 'select': promotes \p I to 32 bits when the subtarget has
		/// 16 bit instructions, \p I produces a 16 bit integer (or vector thereof),
		/// and divergence analysis reports \p I as uniform.
		///
		/// \returns True if \p I was promoted, false otherwise.
		bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
		  // TODO: Should we promote smaller types that will be legalized to i16?
		  if (!ST->has16BitInsts() || !isI16Ty(I.getType()) || !DA->isUniform(&I))
		    return false;

		  return promoteUniformI16OpToI32Op(I);
		}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {		bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;		Mod = &M;
return false;		return false;
}		}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {		bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TM || skipFunction(F))		if (!TM || skipFunction(F))
return false;		return false;
Show All 29 Lines

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

	Show First 20 Lines • Show All 534 Lines • ▼ Show 20 Lines
	bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,			bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const {			Type *Ty) const {
	// FIXME: Could be smarter if called for vector constants.			// FIXME: Could be smarter if called for vector constants.
	return true;			return true;
	}			}

	bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {			bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {

				// i16 is not desirable unless it is a load or a store.
				if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
				return false;

	// SimplifySetCC uses this function to determine whether or not it should			// SimplifySetCC uses this function to determine whether or not it should
	// create setcc with i1 operands. We don't have instructions for i1 setcc.			// create setcc with i1 operands. We don't have instructions for i1 setcc.
	if (VT == MVT::i1 && Op == ISD::SETCC)			if (VT == MVT::i1 && Op == ISD::SETCC)
	return false;			return false;

	return TargetLowering::isTypeDesirableForOp(Op, VT);			return TargetLowering::isTypeDesirableForOp(Op, VT);
	}			}

	▲ Show 20 Lines • Show All 3,331 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
				; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
				; Make sure this doesn't crash with no triple

				; NOOP-LABEL: @noop_fdiv_fpmath(
				; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
				define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
				%md.25ulp = fdiv float %a, %b, !fpmath !0
				store volatile float %md.25ulp, float addrspace(1)* %out
				ret void
				}

				; CHECK-LABEL: @fdiv_fpmath(
				; CHECK: %no.md = fdiv float %a, %b{{$}}
				; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
				; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
				; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
				; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
				; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
				; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
				define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
				%no.md = fdiv float %a, %b
				store volatile float %no.md, float addrspace(1)* %out

				%md.half.ulp = fdiv float %a, %b, !fpmath !1
				store volatile float %md.half.ulp, float addrspace(1)* %out

				%md.1ulp = fdiv float %a, %b, !fpmath !2
				store volatile float %md.1ulp, float addrspace(1)* %out

				%md.25ulp = fdiv float %a, %b, !fpmath !0
				store volatile float %md.25ulp, float addrspace(1)* %out

				%md.3ulp = fdiv float %a, %b, !fpmath !3
				store volatile float %md.3ulp, float addrspace(1)* %out

				%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
				store volatile float %fast.md.25ulp, float addrspace(1)* %out

				%arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
				store volatile float %arcp.md.25ulp, float addrspace(1)* %out

				ret void
				}

				; CHECK-LABEL: @rcp_fdiv_fpmath(
				; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
				; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
				; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
				; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
				; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
				; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
				; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
				define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
				%no.md = fdiv float 1.0, %x
				store volatile float %no.md, float addrspace(1)* %out

				%md.25ulp = fdiv float 1.0, %x, !fpmath !0
				store volatile float %md.25ulp, float addrspace(1)* %out

				%md.half.ulp = fdiv float 1.0, %x, !fpmath !1
				store volatile float %md.half.ulp, float addrspace(1)* %out

				%arcp.no.md = fdiv arcp float 1.0, %x
				store volatile float %arcp.no.md, float addrspace(1)* %out

				%arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
				store volatile float %arcp.25ulp, float addrspace(1)* %out

				%fast.no.md = fdiv fast float 1.0, %x
				store volatile float %fast.no.md, float addrspace(1)* %out

				%fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
				store volatile float %fast.25ulp, float addrspace(1)* %out

				ret void
				}

				; CHECK-LABEL: @fdiv_fpmath_vector(
				; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
				; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
				; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2

				; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
				; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
				; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
				; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
				; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
				; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
				; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
				; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
				define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
				%no.md = fdiv <2 x float> %a, %b
				store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

				%md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
				store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

				%md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
				store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out

				%md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
				store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out

				ret void
				}

				; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
				; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
				; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
				; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
				; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}

				; CHECK: extractelement <2 x float> %x
				; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
				; CHECK: extractelement <2 x float> %x
				; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
				; CHECK: store volatile <2 x float> %arcp.25ulp

				; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
				; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
				; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
				define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
				%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
				store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

				%md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
				store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

				%arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
				store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

				%fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
				store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

				%arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
				store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

				%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
				store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

				ret void
				}

				; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
				; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
				; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
				; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}

				; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
				; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
				; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
				; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
				; CHECK: store volatile <2 x float> %arcp.25ulp

				; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
				; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
				; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
				; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
				; CHECK: store volatile <2 x float> %fast.25ulp
				define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
				%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
				store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

				%arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
				store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

				%fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
				store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

				%arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
				store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

				%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
				store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

				ret void
				}

				; FIXME: Should be able to get fdiv for 1.0 component
				; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
				; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
				; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
				; CHECK: store volatile <2 x float> %arcp.25ulp

				; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
				; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
				; CHECK: store volatile <2 x float> %fast.25ulp
				define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
				%x.insert = insertelement <2 x float> %x, float 1.0, i32 0

				%arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
				store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

				%fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
				store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

				ret void
				}

				; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
				; CHECK: %no.md = fdiv float %a, %b{{$}}
				; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
				; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
				; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
				; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
				; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
				; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
				define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
				%no.md = fdiv float %a, %b
				store volatile float %no.md, float addrspace(1)* %out

				%md.half.ulp = fdiv float %a, %b, !fpmath !1
				store volatile float %md.half.ulp, float addrspace(1)* %out

				%md.1ulp = fdiv float %a, %b, !fpmath !2
				store volatile float %md.1ulp, float addrspace(1)* %out

				%md.25ulp = fdiv float %a, %b, !fpmath !0
				store volatile float %md.25ulp, float addrspace(1)* %out

				%md.3ulp = fdiv float %a, %b, !fpmath !3
				store volatile float %md.3ulp, float addrspace(1)* %out

				%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
				store volatile float %fast.md.25ulp, float addrspace(1)* %out

				%arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
				store volatile float %arcp.md.25ulp, float addrspace(1)* %out

				ret void
				}

				attributes #0 = { nounwind optnone noinline }
				attributes #1 = { nounwind }
				attributes #2 = { nounwind "target-features"="+fp32-denormals" }

				; CHECK: !0 = !{float 2.500000e+00}
				; CHECK: !1 = !{float 5.000000e-01}
				; CHECK: !2 = !{float 1.000000e+00}
				; CHECK: !3 = !{float 3.000000e+00}

				!0 = !{float 2.500000e+00}
				!1 = !{float 5.000000e-01}
				!2 = !{float 1.000000e+00}
				!3 = !{float 3.000000e+00}

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll

				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
				; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s

				; SI-NOT: zext
				; SI-NOT: sext
				; SI-NOT: trunc

				; VI-LABEL: @add_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @add_i16(i16 %a, i16 %b) {
				%r = add i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @add_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @add_nsw_i16(i16 %a, i16 %b) {
				%r = add nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @add_nuw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @add_nuw_i16(i16 %a, i16 %b) {
				%r = add nuw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @add_nuw_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) {
				%r = add nuw nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @sub_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @sub_i16(i16 %a, i16 %b) {
				%r = sub i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @sub_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @sub_nsw_i16(i16 %a, i16 %b) {
				%r = sub nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @sub_nuw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @sub_nuw_i16(i16 %a, i16 %b) {
				%r = sub nuw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @sub_nuw_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) {
				%r = sub nuw nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @mul_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @mul_i16(i16 %a, i16 %b) {
				%r = mul i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @mul_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @mul_nsw_i16(i16 %a, i16 %b) {
				%r = mul nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @mul_nuw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @mul_nuw_i16(i16 %a, i16 %b) {
				%r = mul nuw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @mul_nuw_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @mul_nuw_nsw_i16(i16 %a, i16 %b) {
				%r = mul nuw nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @urem_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @urem_i16(i16 %a, i16 %b) {
				%r = urem i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @srem_i16(
				; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = sext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @srem_i16(i16 %a, i16 %b) {
				%r = srem i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @shl_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @shl_i16(i16 %a, i16 %b) {
				%r = shl i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @shl_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @shl_nsw_i16(i16 %a, i16 %b) {
				%r = shl nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @shl_nuw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @shl_nuw_i16(i16 %a, i16 %b) {
				%r = shl nuw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @shl_nuw_nsw_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @shl_nuw_nsw_i16(i16 %a, i16 %b) {
				%r = shl nuw nsw i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @lshr_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @lshr_i16(i16 %a, i16 %b) {
				%r = lshr i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @lshr_exact_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @lshr_exact_i16(i16 %a, i16 %b) {
				%r = lshr exact i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @ashr_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				define i16 @ashr_i16(i16 %a, i16 %b) {
				%r = ashr i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @ashr_exact_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				; Test input: scalar i16 arithmetic shift right with the exact flag.
				; NOTE(review): the expectations above zero-extend %a before the i32 `ashr`,
				; so the sign bit of the 16-bit value is not replicated into the result
				; (0x8000 ashr exact 1 should be 0xC000, the widened form gives 0x4000).
				; A sign extension looks required -- confirm.
				define i16 @ashr_exact_i16(i16 %a, i16 %b) {
				%r = ashr exact i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @and_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				; Test input: scalar i16 bitwise and. The expectations above zero-extend both
				; operands, perform the `and` in i32, and truncate the result.
				define i16 @and_i16(i16 %a, i16 %b) {
				%r = and i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @or_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				; Test input: scalar i16 bitwise or. The expectations above zero-extend both
				; operands, perform the `or` in i32, and truncate the result.
				define i16 @or_i16(i16 %a, i16 %b) {
				%r = or i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @xor_i16(
				; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32:[0-9]+]] = zext i16 %b to i32
				; VI: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
				; VI: ret i16 %[[R_16]]
				; Test input: scalar i16 bitwise xor. The expectations above zero-extend both
				; operands, perform the `xor` in i32, and truncate the result.
				define i16 @xor_i16(i16 %a, i16 %b) {
				%r = xor i16 %a, %b
				ret i16 %r
				}

				; VI-LABEL: @select_eq_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by `icmp eq` on the same two operands.
				; The expectations above widen the compare and the select separately (each
				; with its own pair of zero extensions) and truncate only the select result.
				define i16 @select_eq_i16(i16 %a, i16 %b) {
				%cmp = icmp eq i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_ne_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by `icmp ne`. The expectations above
				; zero-extend the operands for both the compare and the select, then truncate
				; the selected value.
				define i16 @select_ne_i16(i16 %a, i16 %b) {
				%cmp = icmp ne i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_ugt_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the unsigned predicate `icmp ugt`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected value.
				define i16 @select_ugt_i16(i16 %a, i16 %b) {
				%cmp = icmp ugt i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_uge_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the unsigned predicate `icmp uge`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected value.
				define i16 @select_uge_i16(i16 %a, i16 %b) {
				%cmp = icmp uge i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_ult_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the unsigned predicate `icmp ult`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected value.
				define i16 @select_ult_i16(i16 %a, i16 %b) {
				%cmp = icmp ult i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_ule_i16(
				; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the unsigned predicate `icmp ule`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected value.
				define i16 @select_ule_i16(i16 %a, i16 %b) {
				%cmp = icmp ule i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_sgt_i16(
				; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the signed predicate `icmp sgt`.
				; The expectations above switch to sign extensions for both the compare and
				; the select operands, then truncate the selected value.
				define i16 @select_sgt_i16(i16 %a, i16 %b) {
				%cmp = icmp sgt i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_sge_i16(
				; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the signed predicate `icmp sge`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected value.
				define i16 @select_sge_i16(i16 %a, i16 %b) {
				%cmp = icmp sge i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_slt_i16(
				; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the signed predicate `icmp slt`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected value.
				define i16 @select_slt_i16(i16 %a, i16 %b) {
				%cmp = icmp slt i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @select_sle_i16(
				; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
				; VI: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
				; VI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
				; VI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
				; VI: ret i16 %[[SEL_16]]
				; Test input: scalar i16 select driven by the signed predicate `icmp sle`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected value.
				define i16 @select_sle_i16(i16 %a, i16 %b) {
				%cmp = icmp sle i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; VI-LABEL: @add_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> add without flags. The expectations above zero-extend
				; both vectors to <3 x i32>, add there, and truncate back to <3 x i16>.
				define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = add <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @add_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> add with nsw. The expectations above require the nsw
				; flag to be carried over to the widened <3 x i32> add.
				define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = add nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @add_nuw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> add with nuw. The expectations above require the nuw
				; flag to be carried over to the widened <3 x i32> add.
				define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = add nuw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @add_nuw_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> add with both nuw and nsw. The expectations above
				; require both flags on the widened <3 x i32> add.
				define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = add nuw nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @sub_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> sub without flags. The expectations above zero-extend
				; both vectors to <3 x i32>, subtract there, and truncate back to <3 x i16>.
				define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = sub <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @sub_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> sub with nsw. The expectations above require the nsw
				; flag to be carried over to the widened <3 x i32> sub.
				define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = sub nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @sub_nuw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> sub with nuw. The expectations above require the nuw
				; flag to be carried over to the widened <3 x i32> sub.
				define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = sub nuw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @sub_nuw_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> sub with both nuw and nsw. The expectations above
				; require both flags on the widened <3 x i32> sub.
				define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = sub nuw nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @mul_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> mul without flags. The expectations above zero-extend
				; both vectors to <3 x i32>, multiply there, and truncate back to <3 x i16>.
				define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = mul <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @mul_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> mul with nsw. The expectations above keep nsw on the
				; widened <3 x i32> mul of zero-extended operands.
				; NOTE(review): that looks unsound. For lanes holding 0xFFFF the signed i16
				; product is (-1) * (-1) = 1 (no wrap), yet 65535 * 65535 exceeds the signed
				; i32 range, so the promoted op would be poison where the original was not.
				; Confirm whether nsw should be dropped here.
				define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = mul nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @mul_nuw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> mul with nuw. The expectations above require the nuw
				; flag to be carried over to the widened <3 x i32> mul.
				define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = mul nuw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @mul_nuw_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> mul with both nuw and nsw. The expectations above
				; require both flags on the widened <3 x i32> mul.
				define <3 x i16> @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = mul nuw nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @urem_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> unsigned remainder. The expectations above zero-extend
				; both vectors, take the remainder in <3 x i32>, and truncate the result.
				define <3 x i16> @urem_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = urem <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @srem_3xi16(
				; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> signed remainder. Unlike the unsigned cases, the
				; expectations above sign-extend both vectors before the <3 x i32> srem.
				define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = srem <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @shl_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> shift left without flags. The expectations above
				; zero-extend both vectors, shift in <3 x i32>, and truncate the result.
				define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = shl <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @shl_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> shift left with nsw. The expectations above require
				; the nsw flag on the widened <3 x i32> shl.
				define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = shl nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @shl_nuw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> shift left with nuw. The expectations above require
				; the nuw flag on the widened <3 x i32> shl.
				define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = shl nuw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @shl_nuw_nsw_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> shift left with both nuw and nsw. The expectations
				; above require both flags on the widened <3 x i32> shl.
				define <3 x i16> @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = shl nuw nsw <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @lshr_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> logical shift right without flags. The expectations
				; above zero-extend both vectors, shift in <3 x i32>, and truncate the result.
				define <3 x i16> @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = lshr <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @lshr_exact_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> logical shift right with the exact flag. The
				; expectations above require the flag on the widened <3 x i32> lshr.
				define <3 x i16> @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = lshr exact <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @ashr_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> arithmetic shift right without flags.
				; NOTE(review): the expectations above zero-extend %a before the <3 x i32>
				; `ashr`, so a lane's 16-bit sign bit is not replicated (0x8000 ashr 1 should
				; be 0xC000, the widened form gives 0x4000). A sign extension of the shifted
				; operand looks required -- confirm against the pass's isSigned() handling.
				define <3 x i16> @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = ashr <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @ashr_exact_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> arithmetic shift right with the exact flag.
				; NOTE(review): the expectations above zero-extend %a before the <3 x i32>
				; `ashr`, which loses each lane's 16-bit sign (0x8000 ashr exact 1 should be
				; 0xC000, the widened form gives 0x4000). A sign extension looks required --
				; confirm.
				define <3 x i16> @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = ashr exact <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @and_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> bitwise and. The expectations above zero-extend both
				; vectors, perform the `and` in <3 x i32>, and truncate the result.
				define <3 x i16> @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = and <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @or_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> bitwise or. The expectations above zero-extend both
				; vectors, perform the `or` in <3 x i32>, and truncate the result.
				define <3 x i16> @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = or <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @xor_3xi16(
				; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]]
				; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[R_16]]
				; Test input: <3 x i16> bitwise xor. The expectations above zero-extend both
				; vectors, perform the `xor` in <3 x i32>, and truncate the result.
				define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%r = xor <3 x i16> %a, %b
				ret <3 x i16> %r
				}

				; VI-LABEL: @select_eq_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by a lane-wise `icmp eq` on the same
				; operands. The expectations above widen the compare and the select
				; separately (each with its own zero extensions) and truncate only the
				; select result.
				define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp eq <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_ne_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by a lane-wise `icmp ne`. The
				; expectations above zero-extend the operands for both the compare and the
				; select, then truncate the selected vector.
				define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp ne <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_ugt_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the unsigned predicate `icmp ugt`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected vector.
				define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp ugt <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_uge_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the unsigned predicate `icmp uge`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected vector.
				define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp uge <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_ult_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the unsigned predicate `icmp ult`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected vector.
				define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp ult <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_ule_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the unsigned predicate `icmp ule`.
				; The expectations above use zero extensions for both the compare and the
				; select, then truncate the selected vector.
				define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp ule <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_sgt_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the signed predicate `icmp sgt`.
				; The expectations above switch to sign extensions for both the compare and
				; the select operands, then truncate the selected vector.
				define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp sgt <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_sge_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the signed predicate `icmp sge`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected vector.
				define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp sge <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_slt_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the signed predicate `icmp slt`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected vector.
				define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp slt <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

				; VI-LABEL: @select_sle_3xi16(
				; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]]
				; VI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
				; VI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
				; VI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
				; VI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
				; VI: ret <3 x i16> %[[SEL_16]]
				; Test input: <3 x i16> select driven by the signed predicate `icmp sle`.
				; The expectations above use sign extensions for both the compare and the
				; select operands, then truncate the selected vector.
				define <3 x i16> @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
				%cmp = icmp sle <3 x i16> %a, %b
				%sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
				ret <3 x i16> %sel
				}

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll

	; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
	; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
	; Make sure this doesn't crash with no triple

	; NOOP-LABEL: @noop_fdiv_fpmath(
	; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
	; Test input for the run without a target triple: a single fdiv tagged with
	; !fpmath !0 whose result is stored volatile. The expectation above requires the
	; fdiv to come out of the pass unchanged.
	; Attribute group #3 and metadata node !0 are defined outside this excerpt.
	define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
	%md.25ulp = fdiv float %a, %b, !fpmath !0
	store volatile float %md.25ulp, float addrspace(1)* %out
	ret void
	}

	; CHECK-LABEL: @fdiv_fpmath(
	; CHECK: %no.md = fdiv float %a, %b{{$}}
	; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
	; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
	; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
	; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
	; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
	; CHECK: %arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
	; Test input: scalar float divisions of %a by %b that differ only in their
	; !fpmath node and fast-math flags. Each result is stored volatile so every
	; division stays live. Per the expectations above, the divisions with no
	; metadata, !fpmath !1 and !fpmath !2 must remain plain fdiv, while those tagged
	; !0 or !3 (including the fast and arcp variants) must become calls to
	; @llvm.amdgcn.fdiv.fast that keep their flags and metadata.
	; The !fpmath nodes and attribute group #1 are defined outside this excerpt; the
	; value names suggest !0 = 2.5 ulp, !1 = 0.5 ulp, !2 = 1 ulp, !3 = 3 ulp -- confirm.
	define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
	%no.md = fdiv float %a, %b
	store volatile float %no.md, float addrspace(1)* %out

	%md.half.ulp = fdiv float %a, %b, !fpmath !1
	store volatile float %md.half.ulp, float addrspace(1)* %out

	%md.1ulp = fdiv float %a, %b, !fpmath !2
	store volatile float %md.1ulp, float addrspace(1)* %out

	%md.25ulp = fdiv float %a, %b, !fpmath !0
	store volatile float %md.25ulp, float addrspace(1)* %out

	%md.3ulp = fdiv float %a, %b, !fpmath !3
	store volatile float %md.3ulp, float addrspace(1)* %out

	%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
	store volatile float %fast.md.25ulp, float addrspace(1)* %out

	%arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
	store volatile float %arcp.md.25ulp, float addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @rcp_fdiv_fpmath(
	; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
	; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
	; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
	; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
	; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
	; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
	; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
	; Test input: reciprocal-shaped divisions (constant 1.0 divided by %x) under
	; various combinations of !fpmath metadata and arcp/fast flags, each stored
	; volatile. Per the expectations above, every one of them must stay an fdiv
	; with its flags and metadata intact; none is turned into an intrinsic call.
	; The !fpmath nodes and attribute group #1 are defined outside this excerpt.
	define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
	%no.md = fdiv float 1.0, %x
	store volatile float %no.md, float addrspace(1)* %out

	%md.25ulp = fdiv float 1.0, %x, !fpmath !0
	store volatile float %md.25ulp, float addrspace(1)* %out

	%md.half.ulp = fdiv float 1.0, %x, !fpmath !1
	store volatile float %md.half.ulp, float addrspace(1)* %out

	%arcp.no.md = fdiv arcp float 1.0, %x
	store volatile float %arcp.no.md, float addrspace(1)* %out

	%arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
	store volatile float %arcp.25ulp, float addrspace(1)* %out

	%fast.no.md = fdiv fast float 1.0, %x
	store volatile float %fast.no.md, float addrspace(1)* %out

	%fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
	store volatile float %fast.25ulp, float addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @fdiv_fpmath_vector(
	; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
	; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
	; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2

	; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
	; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
	; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
	; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
	; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
	; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
	; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
	; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
	; Test input: <2 x float> divisions of %a by %b with differing !fpmath nodes,
	; each stored volatile. Per the expectations above, the divisions with no
	; metadata, !fpmath !1 and !fpmath !2 must stay vector fdivs. The one tagged
	; !fpmath !0 must be scalarized: each lane is extracted, divided through
	; @llvm.amdgcn.fdiv.fast, and inserted into a result vector that keeps the
	; original value name.
	; The !fpmath nodes and attribute group #1 are defined outside this excerpt.
	define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
	%no.md = fdiv <2 x float> %a, %b
	store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

	%md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
	store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

	%md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
	store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out

	%md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
	store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
	; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
	; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
	; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
	; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}

	; CHECK: extractelement <2 x float> %x
	; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
	; CHECK: extractelement <2 x float> %x
	; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
	; CHECK: store volatile <2 x float> %arcp.25ulp

	; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
	; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
	; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
	; Test input: <2 x float> reciprocal-shaped divisions (splat of 1.0 divided by
	; %x) under various combinations of !fpmath metadata and arcp/fast flags, each
	; stored volatile. Per the expectations above, the four divisions without
	; metadata or with !fpmath !1 must stay vector fdivs. The arcp and fast
	; divisions tagged !fpmath !0 must be split into per-lane scalar fdivs that
	; keep their flag and metadata, with the rebuilt vector still stored under the
	; original value name.
	; The !fpmath nodes and attribute group #1 are defined outside this excerpt.
	define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
	%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
	store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

	%md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
	store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

	%arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
	store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

	%fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
	store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

	%arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
	store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

	%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
	store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
	; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
	; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
	; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}

	; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
	; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
	; CHECK: store volatile <2 x float> %arcp.25ulp

	; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
	; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
	; CHECK: store volatile <2 x float> %fast.25ulp
	; Same shape as the splat reciprocal test, but the constant numerator is the
	; non-splat vector <1.0, 2.0>. Covers flag-free, arcp and fast divisions
	; without metadata, then arcp and fast divisions with !fpmath !0. Each quotient
	; is stored volatilely to %out.
	define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
	%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
	store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

	%arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
	store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

	%fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
	store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

	%arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
	store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

	%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
	store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

	ret void
	}

	; FIXME: Should be able to get fdiv for 1.0 component
	; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
	; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
	; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
	; CHECK: store volatile <2 x float> %arcp.25ulp

	; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
	; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
	; CHECK: store volatile <2 x float> %fast.25ulp
	; The numerator is %x with element 0 overwritten by the constant 1.0, so only
	; one lane of the numerator is a known constant. It is divided by %y with arcp
	; and then with fast, both carrying !fpmath !0, and each quotient is stored
	; volatilely to %out.
	define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
	%x.insert = insertelement <2 x float> %x, float 1.0, i32 0

	%arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
	store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

	%fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
	store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
	; CHECK: %no.md = fdiv float %a, %b{{$}}
	; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
	; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
	; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
	; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
	; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
	; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
	; Scalar float divisions of %a by %b in a function that uses attribute group
	; #2. Variants are no metadata, !fpmath !1, !2, !0 and !3 without fast-math
	; flags, then fast and arcp divisions that both carry !fpmath !0. Each quotient
	; is stored volatilely to %out.
	define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
	%no.md = fdiv float %a, %b
	store volatile float %no.md, float addrspace(1)* %out

	%md.half.ulp = fdiv float %a, %b, !fpmath !1
	store volatile float %md.half.ulp, float addrspace(1)* %out

	%md.1ulp = fdiv float %a, %b, !fpmath !2
	store volatile float %md.1ulp, float addrspace(1)* %out

	%md.25ulp = fdiv float %a, %b, !fpmath !0
	store volatile float %md.25ulp, float addrspace(1)* %out

	%md.3ulp = fdiv float %a, %b, !fpmath !3
	store volatile float %md.3ulp, float addrspace(1)* %out

	%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
	store volatile float %fast.md.25ulp, float addrspace(1)* %out

	%arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
	store volatile float %arcp.md.25ulp, float addrspace(1)* %out

	ret void
	}

	; Attribute groups referenced by the functions in this test. Group #0 adds
	; optnone and noinline, group #1 is plain nounwind, and group #2 additionally
	; sets the +fp32-denormals target feature.
	attributes #0 = { nounwind optnone noinline }
	attributes #1 = { nounwind }
	attributes #2 = { nounwind "target-features"="+fp32-denormals" }

	; CHECK: !0 = !{float 2.500000e+00}
	; CHECK: !1 = !{float 5.000000e-01}
	; CHECK: !2 = !{float 1.000000e+00}
	; CHECK: !3 = !{float 3.000000e+00}

	; Operands for the !fpmath attachments used on the fdiv instructions above.
	; The value names in the tests pair them up as md.25ulp -> !0 (2.5),
	; md.half.ulp -> !1 (0.5), md.1ulp -> !2 (1.0) and md.3ulp -> !3 (3.0).
	!0 = !{float 2.500000e+00}
	!1 = !{float 5.000000e-01}
	!2 = !{float 1.000000e+00}
	!3 = !{float 3.000000e+00}

llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone			declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
	declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone			declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
	declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone			declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone

	declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone			declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
	declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone			declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
	declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone			declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

	declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone			declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
	declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone			declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
	declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone			declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

	declare i32 @llvm.r600.read.tidig.x() nounwind readnone			declare i32 @llvm.r600.read.tidig.x() nounwind readnone

	; FUNC-LABEL: {{^}}s_ctlz_i32:			; FUNC-LABEL: {{^}}s_ctlz_i32:
	; SI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb\|0x2c}}			; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb\|0x2c}}
	; SI-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]]			; GCN-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]]
	; SI-DAG: v_cmp_eq_i32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}			; GCN-DAG: v_cmp_eq_i32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
	; SI-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]]			; GCN-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]]
	; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]]			; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]]
	; SI: buffer_store_dword [[RESULT]]			; GCN: buffer_store_dword [[RESULT]]
	; SI: s_endpgm			; GCN: s_endpgm

	; EG: FFBH_UINT			; EG: FFBH_UINT
	; EG: CNDE_INT			; EG: CNDE_INT
	; ctlz of the i32 argument %val with the i1 flag operand false, stored to %out.			; ctlz of the i32 argument %val with the i1 flag operand false, stored to %out.
	define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {			define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	store i32 %ctlz, i32 addrspace(1)* %out, align 4			store i32 %ctlz, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i32:			; FUNC-LABEL: {{^}}v_ctlz_i32:
	; SI: buffer_load_dword [[VAL:v[0-9]+]],			; GCN: buffer_load_dword [[VAL:v[0-9]+]],
	; SI-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]			; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
	; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]]			; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]]
	; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc			; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc
	; SI: buffer_store_dword [[RESULT]],			; GCN: buffer_store_dword [[RESULT]],
	; SI: s_endpgm			; GCN: s_endpgm

	; EG: FFBH_UINT			; EG: FFBH_UINT
	; EG: CNDE_INT			; EG: CNDE_INT
	; ctlz of an i32 loaded from %valptr (i1 flag operand false), stored to %out.			; ctlz of an i32 loaded from %valptr (i1 flag operand false), stored to %out.
	define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr, align 4			%val = load i32, i32 addrspace(1)* %valptr, align 4
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	store i32 %ctlz, i32 addrspace(1)* %out, align 4			store i32 %ctlz, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_v2i32:			; FUNC-LABEL: {{^}}v_ctlz_v2i32:
	; SI: buffer_load_dwordx2			; GCN: buffer_load_dwordx2
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: buffer_store_dwordx2			; GCN: buffer_store_dwordx2
	; SI: s_endpgm			; GCN: s_endpgm

	; EG: FFBH_UINT			; EG: FFBH_UINT
	; EG: CNDE_INT			; EG: CNDE_INT
	; EG: FFBH_UINT			; EG: FFBH_UINT
	; EG: CNDE_INT			; EG: CNDE_INT
	; Element-wise ctlz of a <2 x i32> loaded from %valptr, stored to %out.			; Element-wise ctlz of a <2 x i32> loaded from %valptr, stored to %out.
	define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
	%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8			%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
	%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone			%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
	store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8			store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_v4i32:			; FUNC-LABEL: {{^}}v_ctlz_v4i32:
	; SI: buffer_load_dwordx4			; GCN: buffer_load_dwordx4
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: buffer_store_dwordx4			; GCN: buffer_store_dwordx4
	; SI: s_endpgm			; GCN: s_endpgm


	; EG-DAG: FFBH_UINT			; EG-DAG: FFBH_UINT
	; EG-DAG: CNDE_INT			; EG-DAG: CNDE_INT

	; EG-DAG: FFBH_UINT			; EG-DAG: FFBH_UINT
	; EG-DAG: CNDE_INT			; EG-DAG: CNDE_INT

	; EG-DAG: FFBH_UINT			; EG-DAG: FFBH_UINT
	; EG-DAG: CNDE_INT			; EG-DAG: CNDE_INT

	; EG-DAG: FFBH_UINT			; EG-DAG: FFBH_UINT
	; EG-DAG: CNDE_INT			; EG-DAG: CNDE_INT
	; Element-wise ctlz of a <4 x i32> loaded from %valptr, stored to %out.			; Element-wise ctlz of a <4 x i32> loaded from %valptr, stored to %out.
	define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
	%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16			%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
	%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone			%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
	store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16			store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i8:			; FUNC-LABEL: {{^}}v_ctlz_i8:
	; SI: buffer_load_ubyte [[VAL:v[0-9]+]],			; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
	; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]			; GCN-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
	; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]]			; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]]
	; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc			; GCN-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc
	; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]			; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]
	; SI: buffer_store_byte [[RESULT]],			; GCN: buffer_store_byte [[RESULT]],
	; ctlz of an i8 loaded from %valptr (i1 flag operand false), stored to %out.			; ctlz of an i8 loaded from %valptr (i1 flag operand false), stored to %out.
	; NOTE(review) the v_cmp_eq line above uses the CTLZ variable, but the lines for this function only capture VAL, FFBH, CORRECTED_FFBH and RESULT - confirm the intended variable.			; NOTE(review) the v_cmp_eq line above uses the CTLZ variable, but the lines for this function only capture VAL, FFBH, CORRECTED_FFBH and RESULT - confirm the intended variable.
	define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
	%val = load i8, i8 addrspace(1)* %valptr			%val = load i8, i8 addrspace(1)* %valptr
	%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone			%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
	store i8 %ctlz, i8 addrspace(1)* %out			store i8 %ctlz, i8 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}s_ctlz_i64:			; FUNC-LABEL: {{^}}s_ctlz_i64:
	; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb\|0x2c}}			; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb\|0x2c}}
	; SI-DAG: v_cmp_eq_i32_e64 vcc, s[[HI]], 0{{$}}			; GCN-DAG: v_cmp_eq_i32_e64 vcc, s[[HI]], 0{{$}}
	; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]			; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
	; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32			; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
	; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]			; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
	; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]			; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
	; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]			; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
	; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]			; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
	; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
	; SI: {{buffer\|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}			; GCN: {{buffer\|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
	; ctlz of the i64 argument %val (i1 flag operand false), stored to %out as i64.			; ctlz of the i64 argument %val (i1 flag operand false), stored to %out as i64.
	define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {			define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
	%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)			%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
	store i64 %ctlz, i64 addrspace(1)* %out			store i64 %ctlz, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:			; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
	; ctlz of the i64 argument %val, truncated to i32 before being stored to %out.			; ctlz of the i64 argument %val, truncated to i32 before being stored to %out.
	define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {			define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
	%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)			%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
	%trunc = trunc i64 %ctlz to i32			%trunc = trunc i64 %ctlz to i32
	store i32 %trunc, i32 addrspace(1)* %out			store i32 %trunc, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i64:			; FUNC-LABEL: {{^}}v_ctlz_i64:
	; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
	; SI-DAG: {{buffer\|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}			; GCN-DAG: {{buffer\|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
	; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]			; GCN-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
	; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]			; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
	; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]			; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
	; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]			; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
	; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]			; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
	; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]			; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
	; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]			; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
	; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc			; GCN-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
	; SI: {{buffer\|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}			; GCN: {{buffer\|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
	; ctlz of an i64 loaded from %in at index tidig.x, stored to %out at the same index.			; ctlz of an i64 loaded from %in at index tidig.x, stored to %out at the same index.
	; NOTE(review) the last v_cndmask line above captures CTLZ a second time instead of reusing it and names its result CLTZ_LO - confirm both are intended.			; NOTE(review) the last v_cndmask line above captures CTLZ a second time instead of reusing it and names its result CLTZ_LO - confirm both are intended.
	define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {			define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
	%tid = call i32 @llvm.r600.read.tidig.x()			%tid = call i32 @llvm.r600.read.tidig.x()
	%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid			%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
	%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid			%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
	%val = load i64, i64 addrspace(1)* %in.gep			%val = load i64, i64 addrspace(1)* %in.gep
	%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)			%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
	store i64 %ctlz, i64 addrspace(1)* %out.gep			store i64 %ctlz, i64 addrspace(1)* %out.gep
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:			; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
	; ctlz of an i64 loaded from %in at index tidig.x, truncated to i32 and stored to %out at the same index.			; ctlz of an i64 loaded from %in at index tidig.x, truncated to i32 and stored to %out at the same index.
	define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {			define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
	%tid = call i32 @llvm.r600.read.tidig.x()			%tid = call i32 @llvm.r600.read.tidig.x()
	%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid			%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
	%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid			%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
	%val = load i64, i64 addrspace(1)* %in.gep			%val = load i64, i64 addrspace(1)* %in.gep
	%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)			%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
	%trunc = trunc i64 %ctlz to i32			%trunc = trunc i64 %ctlz to i32
	store i32 %trunc, i32 addrspace(1)* %out.gep			store i32 %trunc, i32 addrspace(1)* %out.gep
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:			; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
	; SI: buffer_load_dword [[VAL:v[0-9]+]],			; GCN: buffer_load_dword [[VAL:v[0-9]+]],
	; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]			; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
	; SI: buffer_store_dword [[RESULT]],			; GCN: buffer_store_dword [[RESULT]],
	; SI: s_endpgm			; GCN: s_endpgm
	; Stores -1 when the loaded i32 equals 0, otherwise its ctlz (icmp eq form).			; Stores -1 when the loaded i32 equals 0, otherwise its ctlz (icmp eq form).
	define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr			%val = load i32, i32 addrspace(1)* %valptr
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	%cmp = icmp eq i32 %val, 0			%cmp = icmp eq i32 %val, 0
	%sel = select i1 %cmp, i32 -1, i32 %ctlz			%sel = select i1 %cmp, i32 -1, i32 %ctlz
	store i32 %sel, i32 addrspace(1)* %out			store i32 %sel, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:			; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
	; SI: buffer_load_dword [[VAL:v[0-9]+]],			; GCN: buffer_load_dword [[VAL:v[0-9]+]],
	; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]			; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
	; SI: buffer_store_dword [[RESULT]],			; GCN: buffer_store_dword [[RESULT]],
	; SI: s_endpgm			; GCN: s_endpgm
	; Stores the ctlz when the loaded i32 is nonzero, otherwise -1 (icmp ne form with swapped select arms).			; Stores the ctlz when the loaded i32 is nonzero, otherwise -1 (icmp ne form with swapped select arms).
	define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr			%val = load i32, i32 addrspace(1)* %valptr
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	%cmp = icmp ne i32 %val, 0			%cmp = icmp ne i32 %val, 0
	%sel = select i1 %cmp, i32 %ctlz, i32 -1			%sel = select i1 %cmp, i32 %ctlz, i32 -1
	store i32 %sel, i32 addrspace(1)* %out			store i32 %sel, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; TODO: Should be able to eliminate select here as well.			; TODO: Should be able to eliminate select here as well.
	; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:			; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
	; SI: buffer_load_dword			; GCN: buffer_load_dword
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_cmp			; GCN: v_cmp
	; SI: v_cndmask			; GCN: v_cndmask
	; SI: s_endpgm			; GCN: s_endpgm
	; Here the compare is on the ctlz result itself, which is replaced by -1 when it equals 32.			; Here the compare is on the ctlz result itself, which is replaced by -1 when it equals 32.
	define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr			%val = load i32, i32 addrspace(1)* %valptr
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	%cmp = icmp eq i32 %ctlz, 32			%cmp = icmp eq i32 %ctlz, 32
	%sel = select i1 %cmp, i32 -1, i32 %ctlz			%sel = select i1 %cmp, i32 -1, i32 %ctlz
	store i32 %sel, i32 addrspace(1)* %out			store i32 %sel, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:			; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
	; SI: buffer_load_dword			; GCN: buffer_load_dword
	; SI: v_ffbh_u32_e32			; GCN: v_ffbh_u32_e32
	; SI: v_cmp			; GCN: v_cmp
	; SI: v_cndmask			; GCN: v_cndmask
	; SI: s_endpgm			; GCN: s_endpgm
	; Keeps the ctlz result when it differs from 32, otherwise stores -1 (icmp ne form).			; Keeps the ctlz result when it differs from 32, otherwise stores -1 (icmp ne form).
	define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr			%val = load i32, i32 addrspace(1)* %valptr
	%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone			%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
	%cmp = icmp ne i32 %ctlz, 32			%cmp = icmp ne i32 %ctlz, 32
	%sel = select i1 %cmp, i32 %ctlz, i32 -1			%sel = select i1 %cmp, i32 %ctlz, i32 -1
	store i32 %sel, i32 addrspace(1)* %out			store i32 %sel, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i8_sel_eq_neg1:			; FUNC-LABEL: {{^}}v_ctlz_i8_sel_eq_neg1:
	; SI: buffer_load_ubyte [[VAL:v[0-9]+]],			; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
	; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]			; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
	; SI: buffer_store_byte [[FFBH]],			; GCN: buffer_store_byte [[FFBH]],
	; i8 variant, stores -1 when the loaded value equals 0 and its ctlz otherwise.			; i8 variant, stores -1 when the loaded value equals 0 and its ctlz otherwise.
	define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
	%val = load i8, i8 addrspace(1)* %valptr			%val = load i8, i8 addrspace(1)* %valptr
	%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone			%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
	%cmp = icmp eq i8 %val, 0			%cmp = icmp eq i8 %val, 0
	%sel = select i1 %cmp, i8 -1, i8 %ctlz			%sel = select i1 %cmp, i8 -1, i8 %ctlz
	store i8 %sel, i8 addrspace(1)* %out			store i8 %sel, i8 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:			; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
	; SI: buffer_load_ushort [[VAL:v[0-9]+]],			; SI: buffer_load_ushort [[VAL:v[0-9]+]],
	; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]			; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
	; SI: buffer_store_short [[FFBH]],			; SI: buffer_store_short [[FFBH]],
	; i16 variant, stores -1 when the loaded value equals 0 and its ctlz otherwise.			; i16 variant, stores -1 when the loaded value equals 0 and its ctlz otherwise.
	define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
	%val = load i16, i16 addrspace(1)* %valptr			%val = load i16, i16 addrspace(1)* %valptr
	%ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone			%ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
	%cmp = icmp eq i16 %val, 0			%cmp = icmp eq i16 %val, 0
	%sel = select i1 %cmp, i16 -1, i16 %ctlz			%sel = select i1 %cmp, i16 -1, i16 %ctlz
	store i16 %sel, i16 addrspace(1)* %out			store i16 %sel, i16 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:			; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:
	; SI: buffer_load_ubyte [[VAL:v[0-9]+]],			; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
	; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]			; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
	; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]			; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
	; SI: buffer_store_byte [[TRUNC]],			; GCN: buffer_store_byte [[TRUNC]],
	; i7 variant (a non-power-of-two width), stores -1 when the loaded value equals 0 and its ctlz otherwise.			; i7 variant (a non-power-of-two width), stores -1 when the loaded value equals 0 and its ctlz otherwise.
	define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {			define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
	%val = load i7, i7 addrspace(1)* %valptr			%val = load i7, i7 addrspace(1)* %valptr
	%ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone			%ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
	%cmp = icmp eq i7 %val, 0			%cmp = icmp eq i7 %val, 0
	%sel = select i1 %cmp, i7 -1, i7 %ctlz			%sel = select i1 %cmp, i7 -1, i7 %ctlz
	store i7 %sel, i7 addrspace(1)* %out			store i7 %sel, i7 addrspace(1)* %out
	ret void			ret void
	}			}

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s

				declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
				declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

				; FUNC-LABEL: {{^}}test_umul24_i32:
				; GCN: v_mul_u32_u24
				; Clears the top 8 bits of each i32 argument with a shl/lshr-by-8 pair, then
				; multiplies the two resulting 24-bit values and stores the i32 product.
				define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
				entry:
				%0 = shl i32 %a, 8
				%a_24 = lshr i32 %0, 8
				%1 = shl i32 %b, 8
				%b_24 = lshr i32 %1, 8
				%2 = mul i32 %a_24, %b_24
				store i32 %2, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umul24_i16_sext:
				; SI: v_mul_u32_u24_e{{(32\|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
				; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
				; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}}
				; VI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]]
				; Multiplies the two i16 arguments and stores the product sign-extended to i32.
				define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
				entry:
				%mul = mul i16 %a, %b
				%ext = sext i16 %mul to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
				; GCN: v_mul_u32_u24_e{{(32\|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
				; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
				; Same i16 multiply followed by sext to i32, but the operands are loaded from
				; %in at the indices returned by workitem.id.x and workitem.id.y.
				define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
				%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
				%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
				%ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y
				%a = load i16, i16 addrspace(1)* %ptr_a
				%b = load i16, i16 addrspace(1)* %ptr_b
				%mul = mul i16 %a, %b
				%val = sext i16 %mul to i32
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umul24_i16:
				; SI: s_and_b32
				; SI: v_mul_u32_u24_e32
				; SI: v_and_b32_e32
				; VI: s_mul_i32
				; VI: s_and_b32
				; VI: v_mov_b32_e32
				; Multiplies the two i16 arguments and stores the product zero-extended to i32.
				define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
				entry:
				%mul = mul i16 %a, %b
				%ext = zext i16 %mul to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umul24_i16_vgpr:
				; GCN: v_mul_u32_u24_e32
				; GCN: v_and_b32_e32
				; Same i16 multiply followed by zext to i32, but the operands are loaded from
				; %in at the indices returned by workitem.id.x and workitem.id.y.
				define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
				%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
				%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
				%ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y
				%a = load i16, i16 addrspace(1)* %ptr_a
				%b = load i16, i16 addrspace(1)* %ptr_b
				%mul = mul i16 %a, %b
				%val = zext i16 %mul to i32
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umul24_i8:
				; GCN: v_mul_u32_u24_e{{(32\|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
				; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
				; Multiplies the two i8 arguments and stores the product sign-extended to i32.
				define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
				entry:
				%mul = mul i8 %a, %b
				%ext = sext i8 %mul to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
				; GCN-NOT: and
				; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
				; GCN-NEXT: buffer_store_dword [[RESULT]]
				; Masks both i32 arguments to their low 24 bits (16777215 is 0xffffff),
				; zero-extends them to i64 and multiplies. The product is shifted right by 32
				; and truncated, so only its upper half is stored as an i32.
				define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
				entry:
				%a.24 = and i32 %a, 16777215
				%b.24 = and i32 %b, 16777215
				%a.24.i64 = zext i32 %a.24 to i64
				%b.24.i64 = zext i32 %b.24 to i64
				%mul48 = mul i64 %a.24.i64, %b.24.i64
				%mul48.hi = lshr i64 %mul48, 32
				%mul24hi = trunc i64 %mul48.hi to i32
				store i32 %mul24hi, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}test_umulhi24:
				; GCN-NOT: and
				; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
				; GCN-NEXT: buffer_store_dword [[RESULT]]
				; Same high-half pattern with i64 arguments. Both are masked to their low 24
				; bits, multiplied as i64, and the product shifted right by 32 is truncated
				; to i32 and stored.
				define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
				entry:
				%a.24 = and i64 %a, 16777215
				%b.24 = and i64 %b, 16777215
				%mul48 = mul i64 %a.24, %b.24
				%mul48.hi = lshr i64 %mul48, 32
				%mul24.hi = trunc i64 %mul48.hi to i32
				store i32 %mul24.hi, i32 addrspace(1)* %out
				ret void
				}

				; Multiply with 24-bit inputs and 64-bit output.
				; FUNC-LABEL: {{^}}test_umul24_i64:
				; GCN-NOT: and
				; GCN-NOT: lshr
				; GCN-DAG: v_mul_u32_u24_e32
				; GCN-DAG: v_mul_hi_u32_u24_e32
				; GCN: buffer_store_dwordx2
				; A shl/lshr-by-40 pair keeps only the low 24 bits of each i64 argument. The
				; full i64 product of the two masked values is stored.
				define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
				entry:
				%tmp0 = shl i64 %a, 40
				%a_24 = lshr i64 %tmp0, 40
				%tmp1 = shl i64 %b, 40
				%b_24 = lshr i64 %tmp1, 40
				%tmp2 = mul i64 %a_24, %b_24
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; FIXME: Should be able to eliminate the and.
				; FUNC-LABEL: {{^}}test_umul24_i64_square:
				; GCN: s_load_dword [[A:s[0-9]+]]
				; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[A]], 0xffffff{{$}}
				; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
				; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
				; Keeps the low 24 bits of the single i64 argument via shl/lshr by 40, then
				; multiplies that value by itself and stores the i64 result.
				define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
				entry:
				%tmp0 = shl i64 %a, 40
				%a.24 = lshr i64 %tmp0, 40
				%tmp2 = mul i64 %a.24, %a.24
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; High half of a 16-bit x 16-bit unsigned multiply done in i32.
				; Both arguments are masked to 16 bits (65535 == 0xffff), multiplied as i32,
				; and the upper 16 bits of the 32-bit product are truncated to i16 and stored.
				; The checks expect two scalar masks, one 24-bit low multiply, and a right
				; shift by 16 of that multiply's result.
				; FUNC-LABEL: {{^}}test_umulhi16_i32:
				; GCN: s_and_b32
				; GCN: s_and_b32
				; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]]
				; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]]
				define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
				entry:
				%a.16 = and i32 %a, 65535
				%b.16 = and i32 %b, 65535
				%mul = mul i32 %a.16, %b.16
				%hi = lshr i32 %mul, 16
				%mulhi = trunc i32 %hi to i16
				store i16 %mulhi, i16 addrspace(1)* %out
				ret void
				}

				; 24-bit multiply carried out in the non-power-of-two type i33.
				; shl/lshr by 9 clears the upper 9 bits of each i33 argument, leaving the low
				; 24 bits. The i33 product is zero-extended to i64 and stored.
				; The checks expect two scalar loads, no 'and'/'lshr' before the multiplies,
				; a low-half and a high-half 24-bit multiply, the high half masked with 1
				; (only one bit of the i33 result lies above bit 31), and a 64-bit store of
				; the low result paired with the masked high result.
				; FUNC-LABEL: {{^}}test_umul24_i33:
				; GCN: s_load_dword s
				; GCN: s_load_dword s
				; GCN-NOT: and
				; GCN-NOT: lshr
				; GCN-DAG: v_mul_u32_u24_e32 v[[MUL_LO:[0-9]+]],
				; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
				; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
				; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}}
				define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
				entry:
				%tmp0 = shl i33 %a, 9
				%a_24 = lshr i33 %tmp0, 9
				%tmp1 = shl i33 %b, 9
				%b_24 = lshr i33 %tmp1, 9
				%tmp2 = mul i33 %a_24, %b_24
				%ext = zext i33 %tmp2 to i64
				store i64 %ext, i64 addrspace(1)* %out
				ret void
				}

				; High part of a 24-bit multiply carried out in i33.
				; shl/lshr by 9 clears the upper 9 bits of each i33 argument. The i33 product
				; is shifted right by 32, so only its top bit remains, then truncated to i32
				; and stored.
				; The checks expect two scalar loads, no 'and'/'lshr' before the multiply,
				; then a high-half 24-bit multiply immediately followed by a mask with 1 and
				; the store of the masked value.
				; FUNC-LABEL: {{^}}test_umulhi24_i33:
				; GCN: s_load_dword s
				; GCN: s_load_dword s
				; GCN-NOT: and
				; GCN-NOT: lshr
				; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
				; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
				; GCN-NEXT: buffer_store_dword v[[HI]]
				define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
				entry:
				%tmp0 = shl i33 %a, 9
				%a_24 = lshr i33 %tmp0, 9
				%tmp1 = shl i33 %b, 9
				%b_24 = lshr i33 %tmp1, 9
				%tmp2 = mul i33 %a_24, %b_24
				%hi = lshr i33 %tmp2, 32
				%trunc = trunc i33 %hi to i32
				store i32 %trunc, i32 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-r600.ll

				; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
				; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

				; Basic 24-bit unsigned multiply in i32.
				; shl/lshr by 8 clears the upper 8 bits of each i32 argument, leaving 24-bit
				; operands for the multiply. The check expects one MUL_UINT24 whose sources
				; are KC0[2].Z and KC0[2].W.
				; FUNC-LABEL: {{^}}test_umul24_i32:
				; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
				define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
				entry:
				%0 = shl i32 %a, 8
				%a_24 = lshr i32 %0, 8
				%1 = shl i32 %b, 8
				%b_24 = lshr i32 %1, 8
				%2 = mul i32 %a_24, %b_24
				store i32 %2, i32 addrspace(1)* %out
				ret void
				}

				; The result must be sign-extended.
				; Multiplies two i16 arguments and sign-extends the i16 product to i32 before
				; storing it. The checks expect a MUL_UINT24 followed by a BFE_INT that reads
				; the multiply's channel through PV and takes its width from a literal, with
				; the value 16 appearing afterwards in the output.
				; FUNC-LABEL: {{^}}test_umul24_i16_sext:
				; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
				; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
				; EG: 16
				define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
				entry:
				%mul = mul i16 %a, %b
				%ext = sext i16 %mul to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; The result must be sign-extended.
				; Multiplies two i8 arguments and sign-extends the i8 product to i32 before
				; storing it. The checks expect a MUL_UINT24 followed by a BFE_INT that reads
				; the multiply's channel through PV; the extracted width is not checked here.
				; FUNC-LABEL: {{^}}test_umul24_i8:
				; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
				; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
				define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
				entry:
				%mul = mul i8 %a, %b
				%ext = sext i8 %mul to i32
				store i32 %ext, i32 addrspace(1)* %out
				ret void
				}

				; High half of a 24-bit x 24-bit unsigned multiply, starting from i32 inputs.
				; Both arguments are masked to their low 24 bits (16777215 == 0xffffff),
				; zero-extended to i64 and multiplied; bits [63:32] of the product are stored.
				; The check expects one MULHI_UINT24 whose sources are KC0[2].Z and KC0[2].W.
				; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
				; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
				define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
				entry:
				%a.24 = and i32 %a, 16777215
				%b.24 = and i32 %b, 16777215
				%a.24.i64 = zext i32 %a.24 to i64
				%b.24.i64 = zext i32 %b.24 to i64
				%mul48 = mul i64 %a.24.i64, %b.24.i64
				%mul48.hi = lshr i64 %mul48, 32
				%mul24hi = trunc i64 %mul48.hi to i32
				store i32 %mul24hi, i32 addrspace(1)* %out
				ret void
				}

				; High half of a 24-bit x 24-bit unsigned multiply, starting from i64 inputs.
				; Both i64 arguments are masked to their low 24 bits and multiplied as i64;
				; the product is shifted right by 32 and truncated to i32 before the store.
				; The check expects one MULHI_UINT24 whose sources are KC0[2].W and KC0[3].Y.
				; FUNC-LABEL: {{^}}test_umulhi24:
				; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
				define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
				entry:
				%a.24 = and i64 %a, 16777215
				%b.24 = and i64 %b, 16777215
				%mul48 = mul i64 %a.24, %b.24
				%mul48.hi = lshr i64 %mul48, 32
				%mul24.hi = trunc i64 %mul48.hi to i32
				store i32 %mul24.hi, i32 addrspace(1)* %out
				ret void
				}

				; Multiply with 24-bit inputs and 64-bit output.
				; Each i64 argument is shifted left then logically right by 40, which clears
				; its upper 40 bits and leaves only the low 24 bits. The full i64 product is
				; stored, so both a low-half 24-bit multiply and a high-half multiply are
				; expected. The two checks are order-independent so the test does not rely on
				; which half is emitted first.
				; FUNC-LABEL: {{^}}test_umul24_i64:
				; EG-DAG: MUL_UINT24
				; EG-DAG: MULHI
				define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
				entry:
				%tmp0 = shl i64 %a, 40
				%a_24 = lshr i64 %tmp0, 40
				%tmp1 = shl i64 %b, 40
				%b_24 = lshr i64 %tmp1, 40
				%tmp2 = mul i64 %a_24, %b_24
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

	; Basic 24-bit unsigned multiply in i32.
	; shl/lshr by 8 clears the upper 8 bits of each i32 argument, leaving 24-bit
	; operands for the multiply. The r600 check expects one MUL_UINT24 reading
	; KC0[2].Z and KC0[2].W; the amdgcn check expects a v_mul_u32_u24.
	; FUNC-LABEL: {{^}}test_umul24_i32:
	; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
	; SI: v_mul_u32_u24
	define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
	entry:
	%0 = shl i32 %a, 8
	%a_24 = lshr i32 %0, 8
	%1 = shl i32 %b, 8
	%b_24 = lshr i32 %1, 8
	%2 = mul i32 %a_24, %b_24
	store i32 %2, i32 addrspace(1)* %out
	ret void
	}

	; Multiplies two i16 arguments and sign-extends the i16 product to i32 before
	; storing it. Both targets are expected to use a 24-bit unsigned multiply and
	; then sign-extend the low 16 bits of its result with a bitfield extract.
	; FUNC-LABEL: {{^}}test_umul24_i16_sext:
	; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
	; The result must be sign-extended
	; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
	; EG: 16

	; The multiply may be selected in either its e32 or e64 encoding.
	; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
	; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
	define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
	entry:
	%mul = mul i16 %a, %b
	%ext = sext i16 %mul to i32
	store i32 %ext, i32 addrspace(1)* %out
	ret void
	}

	; Multiplies two i16 arguments and zero-extends the i16 product to i32 before
	; storing it. Only amdgcn output is checked: a scalar mask, a 24-bit unsigned
	; multiply, and a vector mask after the multiply.
	; FUNC-LABEL: {{^}}test_umul24_i16:
	; SI: s_and_b32
	; SI: v_mul_u32_u24_e32
	; SI: v_and_b32_e32
	define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
	entry:
	%mul = mul i16 %a, %b
	%ext = zext i16 %mul to i32
	store i32 %ext, i32 addrspace(1)* %out
	ret void
	}

	; Multiplies two i8 arguments and sign-extends the i8 product to i32 before
	; storing it. Both targets are expected to use a 24-bit unsigned multiply and
	; then a signed bitfield extract of its result; the amdgcn check pins the
	; extracted width to 8 bits.
	; FUNC-LABEL: {{^}}test_umul24_i8:
	; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
	; The result must be sign-extended
	; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
	; The multiply may be selected in either its e32 or e64 encoding.
	; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
	; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8

	define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
	entry:
	%mul = mul i8 %a, %b
	%ext = sext i8 %mul to i32
	store i32 %ext, i32 addrspace(1)* %out
	ret void
	}

	; High half of a 24-bit x 24-bit unsigned multiply, starting from i32 inputs.
	; Both arguments are masked to their low 24 bits (16777215 == 0xffffff),
	; zero-extended to i64 and multiplied; bits [63:32] of the product are stored.
	; amdgcn: a single v_mul_hi_u32_u24 feeding the store, with no 'and' before it.
	; r600: one MULHI_UINT24 reading KC0[2].Z and KC0[2].W.
	; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
	; SI-NOT: and
	; SI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
	; SI-NEXT: buffer_store_dword [[RESULT]]

	; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
	define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
	entry:
	%a.24 = and i32 %a, 16777215
	%b.24 = and i32 %b, 16777215
	%a.24.i64 = zext i32 %a.24 to i64
	%b.24.i64 = zext i32 %b.24 to i64
	%mul48 = mul i64 %a.24.i64, %b.24.i64
	%mul48.hi = lshr i64 %mul48, 32
	%mul24hi = trunc i64 %mul48.hi to i32
	store i32 %mul24hi, i32 addrspace(1)* %out
	ret void
	}

	; High half of a 24-bit x 24-bit unsigned multiply, starting from i64 inputs.
	; Both i64 arguments are masked to their low 24 bits and multiplied as i64;
	; the product is shifted right by 32 and truncated to i32 before the store.
	; amdgcn: a single v_mul_hi_u32_u24 feeding the store, with no 'and' before it.
	; r600: one MULHI_UINT24 reading KC0[2].W and KC0[3].Y.
	; FUNC-LABEL: {{^}}test_umulhi24:
	; SI-NOT: and
	; SI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
	; SI-NEXT: buffer_store_dword [[RESULT]]

	; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
	define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
	entry:
	%a.24 = and i64 %a, 16777215
	%b.24 = and i64 %b, 16777215
	%mul48 = mul i64 %a.24, %b.24
	%mul48.hi = lshr i64 %mul48, 32
	%mul24.hi = trunc i64 %mul48.hi to i32
	store i32 %mul24.hi, i32 addrspace(1)* %out
	ret void
	}

	; Multiply with 24-bit inputs and 64-bit output
	; Each i64 argument is shifted left then logically right by 40, which clears
	; its upper 40 bits and leaves only the low 24 bits. The full i64 product is
	; stored, so both targets are expected to emit a low-half 24-bit multiply and
	; a high-half multiply. The r600 checks are order-independent so the test does
	; not rely on which half is emitted first.
	; FUNC-LABEL: {{^}}test_umul24_i64:
	; EG-DAG: MUL_UINT24
	; EG-DAG: MULHI

	; SI-NOT: and
	; SI-NOT: lshr

	; SI-DAG: v_mul_u32_u24_e32
	; SI-DAG: v_mul_hi_u32_u24_e32

	; SI: buffer_store_dwordx2
	define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
	entry:
	%tmp0 = shl i64 %a, 40
	%a_24 = lshr i64 %tmp0, 40
	%tmp1 = shl i64 %b, 40
	%b_24 = lshr i64 %tmp1, 40
	%tmp2 = mul i64 %a_24, %b_24
	store i64 %tmp2, i64 addrspace(1)* %out
	ret void
	}

	; FIXME: Should be able to eliminate the and
	; Squares a value whose upper 40 bits were cleared by a shl/lshr pair, i.e.
	; the same 24-bit operand is used on both sides of the i64 multiply.
	; The checks record the current amdgcn output: the loaded scalar is still
	; masked with 0xffffff, and that masked register is passed as both operands
	; of the low-half and high-half 24-bit multiplies.
	; FUNC-LABEL: {{^}}test_umul24_i64_square:
	; SI: s_load_dword [[A:s[0-9]+]]
	; SI: s_and_b32 [[TRUNC:s[0-9]+]], [[A]], 0xffffff{{$}}
	; SI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
	; SI-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
	define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
	entry:
	%tmp0 = shl i64 %a, 40
	%a.24 = lshr i64 %tmp0, 40
	%tmp2 = mul i64 %a.24, %a.24
	store i64 %tmp2, i64 addrspace(1)* %out
	ret void
	}

	; High half of a 16-bit x 16-bit unsigned multiply done in i32.
	; Both arguments are masked to 16 bits (65535 == 0xffff), multiplied as i32,
	; and the upper 16 bits of the 32-bit product are truncated to i16 and stored.
	; The amdgcn checks expect two scalar masks, one 24-bit low multiply, and a
	; right shift by 16 of that multiply's result.
	; FUNC-LABEL: {{^}}test_umulhi16_i32:
	; SI: s_and_b32
	; SI: s_and_b32
	; SI: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]]
	; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]]
	define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
	entry:
	%a.16 = and i32 %a, 65535
	%b.16 = and i32 %b, 65535
	%mul = mul i32 %a.16, %b.16
	%hi = lshr i32 %mul, 16
	%mulhi = trunc i32 %hi to i16
	store i16 %mulhi, i16 addrspace(1)* %out
	ret void
	}

	; 24-bit multiply carried out in the non-power-of-two type i33.
	; shl/lshr by 9 clears the upper 9 bits of each i33 argument, leaving the low
	; 24 bits. The i33 product is zero-extended to i64 and stored.
	; The amdgcn checks expect two scalar loads, no 'and'/'lshr' before the
	; multiplies, a low-half and a high-half 24-bit multiply, the high half masked
	; with 1 (only one bit of the i33 result lies above bit 31), and a 64-bit
	; store of the low result paired with the masked high result.
	; FUNC-LABEL: {{^}}test_umul24_i33:
	; SI: s_load_dword s
	; SI: s_load_dword s

	; SI-NOT: and
	; SI-NOT: lshr

	; SI-DAG: v_mul_u32_u24_e32 v[[MUL_LO:[0-9]+]],
	; SI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
	; SI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
	; SI: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}}
	define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
	entry:
	%tmp0 = shl i33 %a, 9
	%a_24 = lshr i33 %tmp0, 9
	%tmp1 = shl i33 %b, 9
	%b_24 = lshr i33 %tmp1, 9
	%tmp2 = mul i33 %a_24, %b_24
	%ext = zext i33 %tmp2 to i64
	store i64 %ext, i64 addrspace(1)* %out
	ret void
	}

	; High part of a 24-bit multiply carried out in i33.
	; shl/lshr by 9 clears the upper 9 bits of each i33 argument. The i33 product
	; is shifted right by 32, so only its top bit remains, then truncated to i32
	; and stored.
	; The amdgcn checks expect two scalar loads, no 'and'/'lshr' before the
	; multiply, then a high-half 24-bit multiply immediately followed by a mask
	; with 1 and the store of the masked value.
	; FUNC-LABEL: {{^}}test_umulhi24_i33:
	; SI: s_load_dword s
	; SI: s_load_dword s

	; SI-NOT: and
	; SI-NOT: lshr

	; SI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
	; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
	; SI-NEXT: buffer_store_dword v[[HI]]
	define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
	entry:
	%tmp0 = shl i33 %a, 9
	%a_24 = lshr i33 %tmp0, 9
	%tmp1 = shl i33 %b, 9
	%b_24 = lshr i33 %tmp1, 9
	%tmp2 = mul i33 %a_24, %b_24
	%hi = lshr i33 %tmp2, 32
	%trunc = trunc i33 %hi to i32
	store i32 %trunc, i32 addrspace(1)* %out
	ret void
	}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Promote uniform i16 ops to i32 ops for targets that have 16 bit instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 72884

llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll

llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-r600.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Promote uniform i16 ops to i32 ops for targets that have 16 bit instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 72884

llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll

llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll

llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-r600.ll

llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll

[AMDGPU] Promote uniform i16 ops to i32 ops for targets that have 16 bit instructions
ClosedPublic