This is an archive of the discontinued LLVM Phabricator instance.

Convert a masked.gather of at most one element to a masked.load
AbandonedPublic

Authored by reames on Apr 22 2019, 11:57 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel

Summary

Split off of D59703. If we have a gather with a single element, that's equivalent to a single element masked.load from the respective address.

Diff Detail

Event Timeline

reames created this revision.Apr 22 2019, 11:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 22 2019, 11:57 AM

Herald added subscribers: llvm-commits, bollu, mcrosier. · View Herald Transcript

reames mentioned this in D59703: Convert a masked.load of a dereferenceable address to an unconditional load.Apr 22 2019, 11:57 AM

reames marked an inline comment as done.Apr 22 2019, 12:29 PM

reames added inline comments.

test/Transforms/InstCombine/masked_intrinsics.ll
211	Note: This result shows a missed scalarization opportunity.

In the generic case, we're creating a masked load of a scalar (although as noted inline, there's apparently no support or test for that pattern). But I don't know what the codegen for a <1 x X> masked load would be (do we cmp/br around the load since it's not speculatable?). Someone else with masked load lowering knowledge (@craig.topper @efriedma ?) should also review this.

If the motivating case really is a constant mask, then I think it's best to limit this patch to that pattern alone and create the scalar load directly.

lib/Transforms/InstCombine/InstCombineCalls.cpp
1234	'countPopulation() == 1' - more direct translation of the code comment?
1251–1252	Change 'auto' to 'Value' for all of these Builder calls to avoid ambiguity.
test/Transforms/InstCombine/masked_intrinsics.ll
204–206	It's fine to show the expected follow-on transform, but we should have a minimal test for this transform alone: define <4 x double> @gather_lane0(<4 x double> %ptrs, <4 x i1> %mask, <4 x double> %passthru) { %mask_with_at_most_1_bit_set = and <4 x i1> %mask, <i1 true, i1 false, i1 false, i1 false> %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double> %ptrs, i32 4, <4 x i1> %mask_with_at_most_1_bit_set, <4 x double> %passthru) ret <4 x double> %res } But there's not enough analysis power in possiblyDemandedEltsInMask() to get this? Could at least reduce the test to this? define <4 x double> @gather_lane1(<4 x double> %ptrs, <4 x double> %passthru) { %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double> %ptrs, i32 4, <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x double> %passthru) ret <4 x double> %res }

In D60975#1476052, @spatel wrote:

In the generic case, we're creating a masked load of a scalar (although as noted inline, there's apparently no support or test for that pattern). But I don't know what the codegen for a <1 x X> masked load would be (do we cmp/br around the load since it's not speculatable?). Someone else with masked load lowering knowledge (@craig.topper @efriedma ?) should also review this.

A single <1 x X> masked load should get scalarized to a compare and branch sequence by the ScalarizeMaskedMemIntrinsic pass. This was done because the type legalizer wants to scalarize 1x vectors. So I have X86's target transform info reporting illegal for 1x vectors. Not sure if it would make sense to custom widen that instead.

RKSimon added a subscriber: RKSimon.Apr 23 2019, 1:04 PM

RKSimon added inline comments.

lib/Transforms/InstCombine/InstCombineCalls.cpp
25	include order?

Abandoning an old review I'm not going to return to any time soon.

Revision Contents

Path

Size

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

32 lines

test/

Transforms/

InstCombine/

masked_intrinsics.ll

8 lines

Diff 196108

lib/Transforms/InstCombine/InstCombineCalls.cpp

Show All 15 Lines
#include "llvm/ADT/ArrayRef.h"		#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"		#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"		#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"		#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"		#include "llvm/Analysis/AssumptionCache.h"
		#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/InstructionSimplify.h"		#include "llvm/Analysis/InstructionSimplify.h"
		RKSimonUnsubmitted Not Done Reply Inline Actions include order? RKSimon: include order?
#include "llvm/Analysis/MemoryBuiltins.h"		#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Transforms/Utils/Local.h"		#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"		#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"		#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"		#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"		#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"		#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
▲ Show 20 Lines • Show All 1,175 Lines • ▼ Show 20 Lines	if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
II.setOperand(0, V);		II.setOperand(0, V);
return &II;		return &II;
}		}

return nullptr;		return nullptr;
}		}

// TODO, Obvious Missing Transforms:		// TODO, Obvious Missing Transforms:
// * Single constant active lane load -> load
// * Dereferenceable address & few lanes -> scalarize speculative load/selects		// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load		// * Adjacent vector addresses -> masked.load
// * Narrow width by halfs excluding zero/undef lanes		// * Narrow width by halfs excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar load		// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load		// * Vector incrementing address -> vector masked load
static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {		static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
// If the mask is all zeros, return the "passthru" argument of the gather.		// If the mask is all zeros, return the "passthru" argument of the gather.
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));		auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
if (ConstMask && ConstMask->isNullValue())		if (!ConstMask)
		return nullptr;
		if (ConstMask->isNullValue())
return IC.replaceInstUsesWith(II, II.getArgOperand(3));		return IC.replaceInstUsesWith(II, II.getArgOperand(3));

		APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
		// If we have at most a single lane active, emit a scalar masked.load. If
		// the mask is known to be active (as opposed to simply not known inactive),
		// then the masked.load combines will convert it to a simple load.
		if (DemandedElts.isPowerOf2()) {
		spatelUnsubmitted Not Done Reply Inline Actions 'countPopulation() == 1' - more direct translation of the code comment? spatel: 'countPopulation() == 1' - more direct translation of the code comment?
		// Note: APInt indexes the bit vector from LSB to MSB, thus
		// countTrailingZeros returns the index in the vector which is set.
		unsigned Idx = DemandedElts.countTrailingZeros();
		auto &B = IC.Builder;
		auto *PtrLane = B.CreateExtractElement(II.getArgOperand(0), Idx);
		unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
		auto *MaskLane = B.CreateExtractElement(II.getArgOperand(2), Idx);
		auto *PassThrough = II.getArgOperand(3);
		auto *PTLane = B.CreateExtractElement(PassThrough, Idx);
		// TODO: pull out a scalar masked load helper function.
		auto *PTy = PointerType::get(VectorType::get(PTLane->getType(), 1),
		PtrLane->getType()->getPointerAddressSpace());
		auto *ML = B.CreateMaskedLoad(B.CreateBitCast(PtrLane, PTy),
		Alignment,
		B.CreateVectorSplat(1, MaskLane),
		B.CreateVectorSplat(1, PTLane));
		auto *E = B.CreateExtractElement(ML, (uint64_t)0);
		auto *Res = B.CreateInsertElement(PassThrough, E, Idx);
		spatelUnsubmitted Not Done Reply Inline Actions Change 'auto' to 'Value' for all of these Builder calls to avoid ambiguity. spatel: Change 'auto' to 'Value' for all of these Builder calls to avoid ambiguity.
		return IC.replaceInstUsesWith(II, Res);
		}

return nullptr;		return nullptr;
}		}

// TODO, Obvious Missing Transforms:		// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store		// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store		// * Adjacent vector addresses -> masked.store
// * Narrow store width by halfs excluding zero/undef lanes		// * Narrow store width by halfs excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar store		// * Vector splat address w/known mask -> scalar store
▲ Show 20 Lines • Show All 3,593 Lines • Show Last 20 Lines

test/Transforms/InstCombine/masked_intrinsics.ll

	Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines
	; CHECK-LABEL: @gather_onemask(			; CHECK-LABEL: @gather_onemask(
	; CHECK-NEXT: [[RES:%.]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double> [[PTRS:%.*]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> undef)			; CHECK-NEXT: [[RES:%.]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double> [[PTRS:%.*]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
	; CHECK-NEXT: ret <2 x double> [[RES]]			; CHECK-NEXT: ret <2 x double> [[RES]]
	;			;
	%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru)			%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru)
	ret <2 x double> %res			ret <2 x double> %res
	}			}

	define <4 x double> @gather_lane2(double* %base, double %pt) {			define <4 x double> @gather_lane2(double* %base, double %pt) {
	; CHECK-LABEL: @gather_lane2(			; CHECK-LABEL: @gather_lane2(
	; CHECK-NEXT: [[PTRS:%.]] = getelementptr double, double [[BASE:%.*]], <4 x i64> <i64 undef, i64 undef, i64 2, i64 undef>			; CHECK-NEXT: [[PTRS:%.]] = getelementptr double, double [[BASE:%.*]], <4 x i64> <i64 undef, i64 undef, i64 2, i64 undef>
				spatelUnsubmitted Not Done Reply Inline Actions It's fine to show the expected follow-on transform, but we should have a minimal test for this transform alone: define <4 x double> @gather_lane0(<4 x double> %ptrs, <4 x i1> %mask, <4 x double> %passthru) { %mask_with_at_most_1_bit_set = and <4 x i1> %mask, <i1 true, i1 false, i1 false, i1 false> %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double> %ptrs, i32 4, <4 x i1> %mask_with_at_most_1_bit_set, <4 x double> %passthru) ret <4 x double> %res } But there's not enough analysis power in possiblyDemandedEltsInMask() to get this? Could at least reduce the test to this? define <4 x double> @gather_lane1(<4 x double> %ptrs, <4 x double> %passthru) { %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double> %ptrs, i32 4, <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x double> %passthru) ret <4 x double> %res } spatel: It's fine to show the expected follow-on transform, but we should have a minimal test for this…
	; CHECK-NEXT: [[PT_V1:%.]] = insertelement <4 x double> undef, double [[PT:%.]], i64 0			; CHECK-NEXT: [[PT_V1:%.]] = insertelement <4 x double> undef, double [[PT:%.]], i64 0
	; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>			; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>
	; CHECK-NEXT: [[RES:%.]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double> [[PTRS]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]])			; CHECK-NEXT: [[BC:%.]] = bitcast <4 x double> [[PTRS]] to <4 x <1 x double>*>
	; CHECK-NEXT: ret <4 x double> [[RES]]			; CHECK-NEXT: [[TMP1:%.]] = extractelement <4 x <1 x double>> [[BC]], i64 2
				; CHECK-NEXT: [[UNMASKEDLOAD:%.]] = load <1 x double>, <1 x double> [[TMP1]], align 4
				reamesAuthorUnsubmitted Done Reply Inline Actions Note: This result shows a missed scalarization opportunity. reames: Note: This result shows a missed scalarization opportunity.
				; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x double> [[UNMASKEDLOAD]], <1 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
				; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[PT_V2]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
				; CHECK-NEXT: ret <4 x double> [[TMP3]]
	;			;
	%ptrs = getelementptr double, double *%base, <4 x i64> <i64 0, i64 1, i64 2, i64 3>			%ptrs = getelementptr double, double *%base, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
	%pt_v1 = insertelement <4 x double> undef, double %pt, i64 0			%pt_v1 = insertelement <4 x double> undef, double %pt, i64 0
	%pt_v2 = shufflevector <4 x double> %pt_v1, <4 x double> undef, <4 x i32> zeroinitializer			%pt_v2 = shufflevector <4 x double> %pt_v1, <4 x double> undef, <4 x i32> zeroinitializer
	%res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %pt_v2)			%res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %pt_v2)
	ret <4 x double> %res			ret <4 x double> %res
	}			}

	▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines