diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -982,13 +982,68 @@ if (VWidth == 1) return nullptr; - ConstantInt *NewDMask = nullptr; + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(II); + + // Assume the arguments are unchanged and later override them, if needed. + SmallVector Args; + for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) { + Args.push_back(II->getArgOperand(I)); + } if (DMaskIdx < 0) { - // Pretend that a prefix of elements is demanded to simplify the code - // below. - DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; + // Buffer case. + + const unsigned ActiveBits = DemandedElts.getActiveBits(); + const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + + // Start assuming the prefix of elements is demanded, but possibly clear some other bits if + // there are trailing zeros (unused components at front) and update offset. + DemandedElts = (1 << ActiveBits) - 1; + + if (UnusedComponentsAtFront > 0) { + static const unsigned InvalidOffset = 0xf; + + unsigned OffsetIdx; + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_s_buffer_load: + // If resulting type would be vec3, there is no point in trimming the + // load with updated offset, as the vec3 would most likely be widened to + // vec4 anyway during lowering. + if (ActiveBits == 4 && UnusedComponentsAtFront == 1) + OffsetIdx = InvalidOffset; + else + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + OffsetIdx = 2; + break; + default: + // TODO: handle *tbuffer* intrinsics. + OffsetIdx = InvalidOffset; + break; + } + + if (OffsetIdx != InvalidOffset) { + // Clear demanded bits and update the offset. + DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); + auto Offset = II->getArgOperand(OffsetIdx); + unsigned OffsetAdd = + UnusedComponentsAtFront * II->getType()->getScalarSizeInBits() / 8; + auto OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); + Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); + } + } } else { + // Image case. + ConstantInt *DMask = cast(II->getArgOperand(DMaskIdx)); unsigned DMaskVal = DMask->getZExtValue() & 0xf; @@ -1007,7 +1062,7 @@ } if (DMaskVal != NewDMaskVal) - NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); + Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); } unsigned NewNumElts = DemandedElts.countPopulation(); @@ -1015,8 +1070,8 @@ return UndefValue::get(II->getType()); if (NewNumElts >= VWidth && DemandedElts.isMask()) { - if (NewDMask) - II->setArgOperand(DMaskIdx, NewDMask); + if (DMaskIdx >= 0) + II->setArgOperand(DMaskIdx, Args[DMaskIdx]); return nullptr; } @@ -1039,16 +1094,6 @@ OverloadTys[0] = NewTy; Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); - SmallVector Args; - for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) - Args.push_back(II->getArgOperand(I)); - - if (NewDMask) - Args[DMaskIdx] = NewDMask; - - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); NewCall->takeName(II); NewCall->copyMetadata(*II); @@ -1690,6 +1735,7 @@ case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_s_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -46,9 +46,9 @@ } ; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 @@ -65,9 +65,9 @@ } ; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 @@ -75,9 +75,9 @@ } ; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 @@ -85,9 +85,9 @@ } ; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 @@ -104,9 +104,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -114,9 +114,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -133,9 +133,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -246,9 +246,9 @@ } ; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 @@ -256,9 +256,9 @@ } ; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 @@ -275,9 +275,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -427,9 +427,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -446,9 +446,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -456,9 +456,9 @@ } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -466,9 +466,9 @@ } ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -485,9 +485,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -495,9 +495,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -514,9 +514,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -543,9 +543,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -553,9 +553,9 @@ } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -572,9 +572,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -666,9 +666,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -685,9 +685,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -695,9 +695,9 @@ } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -705,9 +705,9 @@ } ; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -724,9 +724,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -734,9 +734,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -753,9 +753,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -782,9 +782,9 @@ } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -792,9 +792,9 @@ } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -811,9 +811,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -901,9 +901,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -920,9 +920,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -930,9 +930,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -940,9 +940,9 @@ } ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -959,9 +959,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -969,9 +969,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -988,9 +988,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -1017,9 +1017,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -1027,9 +1027,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -1046,9 +1046,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -1140,9 +1140,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -1159,9 +1159,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -1169,9 +1169,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -1179,9 +1179,9 @@ } ; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 12 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -1198,9 +1198,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -1208,9 +1208,9 @@ } ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -1227,9 +1227,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -1256,9 +1256,9 @@ } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -1266,9 +1266,9 @@ } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 +; CHECK-NEXT: %1 = add i32 %ofs, 8 +; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -1285,9 +1285,9 @@ } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf +; CHECK-NEXT: %1 = add i32 %ofs, 4 +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) +; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32>