diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1012,13 +1012,64 @@
   if (VWidth == 1)
     return nullptr;
 
-  ConstantInt *NewDMask = nullptr;
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  Builder.SetInsertPoint(II);
+
+  // Assume the arguments are unchanged and later override them, if needed.
+  SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
 
   if (DMaskIdx < 0) {
-    // Pretend that a prefix of elements is demanded to simplify the code
-    // below.
-    DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+    // Buffer case.
+
+    const unsigned ActiveBits = DemandedElts.getActiveBits();
+    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
+
+    // Start assuming the prefix of elements is demanded, but possibly clear
+    // some other bits if there are trailing zeros (unused components at front)
+    // and update offset.
+    DemandedElts = (1 << ActiveBits) - 1;
+
+    if (UnusedComponentsAtFront > 0) {
+      static const unsigned InvalidOffsetIdx = 0xf;
+
+      unsigned OffsetIdx;
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::amdgcn_raw_buffer_load:
+        OffsetIdx = 1;
+        break;
+      case Intrinsic::amdgcn_s_buffer_load:
+        // If resulting type is vec3, there is no point in trimming the
+        // load with updated offset, as the vec3 would most likely be widened to
+        // vec4 anyway during lowering.
+        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
+          OffsetIdx = InvalidOffsetIdx;
+        else
+          OffsetIdx = 1;
+        break;
+      case Intrinsic::amdgcn_struct_buffer_load:
+        OffsetIdx = 2;
+        break;
+      default:
+        // TODO: handle tbuffer* intrinsics.
+        OffsetIdx = InvalidOffsetIdx;
+        break;
+      }
+
+      if (OffsetIdx != InvalidOffsetIdx) {
+        // Clear demanded bits and update the offset.
+        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
+        auto *Offset = II->getArgOperand(OffsetIdx);
+        unsigned SingleComponentSizeInBits =
+            getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
+        unsigned OffsetAdd =
+            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
+        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
+        Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
+      }
+    }
   } else {
+    // Image case.
+
     ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
 
@@ -1037,7 +1088,7 @@
     }
 
     if (DMaskVal != NewDMaskVal)
-      NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
   }
 
   unsigned NewNumElts = DemandedElts.countPopulation();
@@ -1045,8 +1096,8 @@
     return UndefValue::get(II->getType());
 
   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
-    if (NewDMask)
-      II->setArgOperand(DMaskIdx, NewDMask);
+    if (DMaskIdx >= 0)
+      II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
 
     return nullptr;
   }
@@ -1069,16 +1120,6 @@
   OverloadTys[0] = NewTy;
   Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
 
-  SmallVector<Value *, 16> Args;
-  for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
-    Args.push_back(II->getArgOperand(I));
-
-  if (NewDMask)
-    Args[DMaskIdx] = NewDMask;
-
-  IRBuilderBase::InsertPointGuard Guard(Builder);
-  Builder.SetInsertPoint(II);
-
   CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(II);
   NewCall->copyMetadata(*II);
@@ -1747,6 +1788,7 @@
     case Intrinsic::amdgcn_raw_buffer_load:
     case Intrinsic::amdgcn_raw_buffer_load_format:
    case Intrinsic::amdgcn_raw_tbuffer_load:
+    case Intrinsic::amdgcn_s_buffer_load:
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_buffer_load_format:
    case Intrinsic::amdgcn_struct_tbuffer_load:
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -427,9 +427,9 @@
 }
 
 ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
+; CHECK-NEXT: %1 = add i32 %ofs, 4
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
@@ -446,9 +446,9 @@
 }
 
 ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
+; CHECK-NEXT: %1 = add i32 %ofs, 4
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
@@ -456,9 +456,9 @@
 }
 
 ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
+; CHECK-NEXT: %1 = add i32 %ofs, 8
+; CHECK-NEXT: %data = call float
@llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -466,9 +466,9 @@ } ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -485,9 +485,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -495,9 +495,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -514,9 +514,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -543,9 +543,9 @@ } ; 
CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -553,9 +553,9 @@ } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -572,9 +572,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -620,6 +620,490 @@ declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1 +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( +; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt0 = extractelement <2 x half> %data, i32 0 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <2 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data 
+define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt0 = extractelement <3 x half> %data, i32 1 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 6 +; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 3 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x half> +define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> + ret <2 x half> %shuf +} + +declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1 + +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( +; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt0 = extractelement <2 x i8> %data, i32 0 + ret i8 %elt0 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <2 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 
0) + %elt0 = extractelement <3 x i8> %data, i32 1 + ret i8 %elt0 +} + +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 3 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 3 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x i8> +define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> + ret <2 x i8> %shuf +} + +declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1 + +; -------------------------------------------------------------------- +; llvm.amdgcn.s.buffer.load +; -------------------------------------------------------------------- + +; CHECK-LABEL: @s_buffer_load_f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + ret float %data +} + +; CHECK-LABEL: @s_buffer_load_v2f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + ret <2 x float> %data +} + +; CHECK-LABEL: @s_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <4 x float> %data +define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + ret <4 x float> %data +} + +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( +; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> 
%rsrc, i32 %ofs, i32 0) + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <2 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt0 = extractelement <4 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x float> %data, i32 3 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( +; 
CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt0 = extractelement <3 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <3 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <3 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: 
%1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> + ret <2 x float> %shuf +} + +; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened +; to vec4 anyway during lowering. +; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( +; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 +; CHECK-NEXT: ret i32 %tmp2 +define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %tmp1 = bitcast <4 x float> %tmp to <4 x i32> + %tmp2 = extractelement <4 x i32> %tmp1, i32 0 + ret i32 %tmp2 +} + +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( +; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float +; CHECK-NEXT: ret float %tmp2 +define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %tmp1 = bitcast <4 x i32> %tmp to <4 x float> + %tmp2 = extractelement <4 x float> %tmp1, i32 0 + ret float %tmp2 +} + +; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 +; CHECK-NEXT: ret float %data +define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) #1 +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1 +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1 + +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( +; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt0 = extractelement <2 x half> %data, i32 0 + ret 
half %elt0 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <2 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <3 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 1 + ret half %elt1 +} + + +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 6 +; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 3 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <2 x half> +define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> + ret <2 x half> %shuf +} + +declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1 + +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( +; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt0 = extractelement <2 x i8> %data, i32 0 + ret i8 %elt0 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <2 x i8> %data, i32 1 + 
ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <3 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 3 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 3 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) +; CHECK-NEXT: ret <2 x i8> +define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) + %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> + ret <2 x i8> %shuf +} + +declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1 +declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 + ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load.format ; -------------------------------------------------------------------- @@ -901,9 +1385,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -920,9 +1404,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, 
i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -930,9 +1414,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -940,9 +1424,9 @@ } ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -959,9 +1443,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -969,9 +1453,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) 
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -988,9 +1472,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -1017,9 +1501,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -1027,9 +1511,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -1046,9 +1530,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -1094,6 +1578,132 @@ declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( +; CHECK: 
%data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt0 = extractelement <2 x half> %data, i32 0 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <2 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <3 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 2 +; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 1 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( +; CHECK-NEXT: %1 = add i32 %ofs, 6 +; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x half> %data, i32 3 + ret half %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x half> +define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> + ret <2 x half> %shuf +} + +declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 x half> 
@llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 + +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( +; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt0 = extractelement <2 x i8> %data, i32 0 + ret i8 %elt0 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <2 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <3 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 1 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 1 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( +; CHECK-NEXT: %1 = add i32 %ofs, 3 +; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret i8 %data +define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %elt1 = extractelement <4 x i8> %data, i32 3 + ret i8 %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x i8> +define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { + %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> + ret <2 x i8> %shuf +} + +declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #1 +declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) #1 +declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) #1 +declare <4 
x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) #1 + ; -------------------------------------------------------------------- ; llvm.amdgcn.struct.buffer.load.format ; --------------------------------------------------------------------
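
For reference, the buffer-intrinsic transform exercised by the new tests can be summarized with a minimal before/after IR sketch. This is illustrative only: it mirrors the raw.buffer.load tests above, assumes 32-bit float components (so skipping one leading component bumps the byte offset by 4), and uses the invented name %ofs.adj where InstCombine itself would emit an unnamed add.

; Before: only component 1 of the <2 x float> load is used.
  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
  %elt1 = extractelement <2 x float> %data, i32 1

; After: the load is trimmed to a single component and the offset is advanced
; past the unused leading component (1 component * 4 bytes).
  %ofs.adj = add i32 %ofs, 4
  %elt1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs.adj, i32 %sofs, i32 0)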