This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Don't combine memory intrs to v3i16
ClosedPublic

Authored by Flakebi on Jul 21 2020, 2:14 AM.

Download Raw Diff

Details

Reviewers

arsenm
piotr

Commits

rG2c659082bda6: [AMDGPU] Don't combine memory intrs to v3i16

Summary

v3i16 and v3f16 currently cannot be legalized and lowered so they should
not be emitted by inst combining.

Moved the check down to still allow extracting 1 or 2 elements via the dmask.

Fixes image intrinsics being combined to return v3x16.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

Flakebi created this revision. · Jul 21 2020, 2:14 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 21 2020, 2:14 AM

Herald added subscribers: llvm-commits, kerbowa, hiraditya and 8 others. · View Herald Transcript

Looks good to me as a stop-gap solution.

Harbormaster failed remote builds in B65044: Diff 279451! · Jul 21 2020, 4:03 AM

Is there any real obstacle to handling these? Even the DAG has v3i16/v3f16 types now

I’m also trying to get it working properly (currently for SDag). I think I got the legalization/widening part working but I’m still trying to figure out how to select the right instruction patterns.

The next two weeks I’m on vacation, so it will still take a while. I think Marek wants a slightly quicker fix, probably something in mesa hit this.

LGTM but I think we should stop hacking around these

This revision is now accepted and ready to land. · Jul 21 2020, 10:01 AM

Closed by commit rG2c659082bda6: [AMDGPU] Don't combine memory intrs to v3i16 (authored by sebastian-ne). · Explain Why · Jul 22 2020, 3:44 AM

This revision was automatically updated to reflect the committed changes.

Can you please cherry-pick this to the LLVM 11 branch?

In D84223#2181158, @mareko wrote:

Can you please cherry-pick this to the LLVM 11 branch?

https://bugs.llvm.org/show_bug.cgi?id=46893

Revision Contents

Path

Size

llvm/

lib/

Transforms/

InstCombine/

InstCombineSimplifyDemanded.cpp

11 lines

test/

Transforms/

InstCombine/

AMDGPU/

amdgcn-demanded-vector-elts.ll

58 lines

Diff 279757

llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

Show First 20 Lines • Show All 1,024 Lines • ▼ Show 20 Lines
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.		/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///		///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have		/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.		/// struct returns.
Value InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst II,		Value InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst II,
APInt DemandedElts,		APInt DemandedElts,
int DMaskIdx) {		int DMaskIdx) {

// FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
if (DMaskIdx < 0 &&
II->getType()->getScalarSizeInBits() != 32 &&
DemandedElts.getActiveBits() == 3)
return nullptr;

auto *IIVTy = cast<VectorType>(II->getType());		auto *IIVTy = cast<VectorType>(II->getType());
unsigned VWidth = IIVTy->getNumElements();		unsigned VWidth = IIVTy->getNumElements();
if (VWidth == 1)		if (VWidth == 1)
return nullptr;		return nullptr;

IRBuilderBase::InsertPointGuard Guard(Builder);		IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);		Builder.SetInsertPoint(II);

▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	if (DMaskIdx < 0) {
if (DMaskVal != NewDMaskVal)		if (DMaskVal != NewDMaskVal)
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);		Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}		}

unsigned NewNumElts = DemandedElts.countPopulation();		unsigned NewNumElts = DemandedElts.countPopulation();
if (!NewNumElts)		if (!NewNumElts)
return UndefValue::get(II->getType());		return UndefValue::get(II->getType());

		// FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
		// fully supported.
		if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
		return nullptr;

if (NewNumElts >= VWidth && DemandedElts.isMask()) {		if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)		if (DMaskIdx >= 0)
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);		II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;		return nullptr;
}		}

// Validate function argument and return types, extracting overloaded types		// Validate function argument and return types, extracting overloaded types
// along the way.		// along the way.
▲ Show 20 Lines • Show All 742 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

	Show First 20 Lines • Show All 2,959 Lines • ▼ Show 20 Lines
	}			}

	declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1			declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.image.sample.cd.cl			; llvm.amdgcn.image.sample.cd.cl
	; --------------------------------------------------------------------			; --------------------------------------------------------------------

				; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: ret half %data
				define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%elt0 = extractelement <4 x half> %data, i32 3
				ret half %elt0
				}

				; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: ret half %data
				define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%elt0 = extractelement <4 x half> %data, i32 2
				ret half %elt0
				}

				; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: ret half %data
				define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%elt0 = extractelement <4 x half> %data, i32 1
				ret half %elt0
				}

				; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32).
				; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
				; CHECK-NEXT: ret <4 x half> %res
				define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
				ret <4 x half> %res
				}

				; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
				; CHECK-NEXT: ret <4 x half> %res
				define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
				ret <4 x half> %res
				}

				; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
				; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0
				; CHECK-NEXT: ret <4 x half> %res
				define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
				%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
				%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
				ret <4 x half> %res
				}

	; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(			; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
	; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)			; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
	; CHECK-NEXT: ret half %data			; CHECK-NEXT: ret half %data
	define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {			define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
	%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)			%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
	%elt0 = extractelement <4 x half> %data, i32 0			%elt0 = extractelement <4 x half> %data, i32 0
	ret half %elt0			ret half %elt0
	}			}
	▲ Show 20 Lines • Show All 797 Lines • Show Last 20 Lines