diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1030,12 +1030,6 @@ APInt DemandedElts, int DMaskIdx) { - // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. - if (DMaskIdx < 0 && - II->getType()->getScalarSizeInBits() != 32 && - DemandedElts.getActiveBits() == 3) - return nullptr; - auto *IIVTy = cast(II->getType()); unsigned VWidth = IIVTy->getNumElements(); if (VWidth == 1) @@ -1124,6 +1118,11 @@ if (!NewNumElts) return UndefValue::get(II->getType()); + // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are + // fully supported. Until then, bail out instead of narrowing to 3 elements. + if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3) + return nullptr; + if (NewNumElts >= VWidth && DemandedElts.isMask()) { if (DMaskIdx >= 0) II->setArgOperand(DMaskIdx, Args[DMaskIdx]); diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -2965,6 +2965,64 @@ ; llvm.amdgcn.image.sample.cd.cl ; -------------------------------------------------------------------- +; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> 
@llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 3 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 2 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 1 + ret half %elt0 +} + +; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32). 
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + +; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + +; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, 
i32 0) +; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0 +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + ; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( ; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) ; CHECK-NEXT: ret half %data