Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8025,7 +8025,7 @@
 
   switch(Opc) {
   default:
-    return SDValue();
+    break; // TODO: Support other binary operations.
   case ISD::FADD:
   case ISD::FSUB:
 
@@ -8051,12 +8051,34 @@
     }
   }
 
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
   unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
 
+  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+  // This eliminates the non-constant index and the subsequent movrel or
+  // scratch access. Sub-dword vectors of two dwords or less have a better
+  // implementation. Vectors bigger than 8 dwords would yield too many
+  // v_cndmask_b32 instructions.
+  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
+      !isa<ConstantSDNode>(N->getOperand(1))) {
+    SDLoc SL(N);
+    SDValue Idx = N->getOperand(1);
+    EVT IdxVT = Idx.getValueType();
+    SDValue V;
+    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+      SDValue IC = DAG.getConstant(I, SL, IdxVT);
+      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+      if (I == 0)
+        V = Elt;
+      else
+        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+    }
+    return V;
+  }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
   // elements. This exposes more load reduction opportunities by replacing
   // multiple small extract_vector_elements with a single 32-bit extract.
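For illustration only (this block is not part of the patch): for a hypothetical
<4 x float> %v indexed by a variable %idx, the loop above builds the rough LLVM
IR equivalent of

  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %c1 = icmp eq i32 %idx, 1
  %s1 = select i1 %c1, float %e1, float %e0
  %c2 = icmp eq i32 %idx, 2
  %s2 = select i1 %c2, float %e2, float %s1
  %c3 = icmp eq i32 %idx, 3
  %e  = select i1 %c3, float %e3, float %s2

which lowers to one v_cmp plus one v_cndmask_b32 per element after the first,
instead of an m0 write plus v_movrels_b32 or a scratch round trip. The tests
below check exactly that pattern.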
Index: test/CodeGen/AMDGPU/extract_vector_dynelt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -0,0 +1,287 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}float4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
+define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}int4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], [[C3]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
+define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
+; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
+; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
+; GCN: store_short v[{{[0-9:]+}}], v[[VRL]]
+define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
+  store half %ext, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float2_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
+define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double2_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN: store_short v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
+  store half %ext, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN: store_short v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
+  store i16 %ext, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float16_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 m0,
+; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
+; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
+define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double16_extelt:
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: store_dword
+define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605
+; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
+; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
+; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
+; GCN: store_byte v[{{[0-9:]+}}], v[[VRL]]
+define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
+  store i8 %ext, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte16_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8
+; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9
+; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10
+; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11
+; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], [[IDX]], 12
+; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13
+; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14
+; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C9]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
+; GCN: store_byte v[{{[0-9:]+}}], [[V15]]
+define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
+  store i8 %ext, i8 addrspace(1)* %out
+  ret void
+}
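For illustration only (this block is not part of the patch): half4_extelt and
byte8_extelt above take a different path because the VecSize > 64 clause in the
new guard excludes sub-dword vectors that fit in 64 bits or less. Those are
read out of an SGPR pair with a shift instead, roughly the LLVM IR equivalent
of the following, assuming a <4 x half> %v and index %idx:

  %bc   = bitcast <4 x half> %v to i64
  %bits = shl i32 %idx, 4          ; 16 bits per element
  %amt  = zext i32 %bits to i64
  %srl  = lshr i64 %bc, %amt
  %lo   = trunc i64 %srl to i16
  %e    = bitcast i16 %lo to half

which matches the s_lshl_b32 by 4 (or by 3 for bytes) plus s_lshr_b64 that the
checks expect.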
Index: test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
 ; GCN: buffer_load_dwordx4
@@ -13,6 +13,14 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
@@ -20,6 +28,17 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
Index: test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -30,6 +30,11 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -37,6 +42,12 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
+; GCN: buffer_load_dwordx4
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
   %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
   %or = or <2 x i64> %load, %arst
@@ -46,6 +57,14 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -53,6 +72,17 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
Index: test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -26,15 +26,17 @@
 
 ; CHECK: s_cbranch_scc1 [[BB4:BB[0-9]+_[0-9]+]]
 
 ; CHECK: buffer_load_dwordx4
-; CHECK: s_mov_b32 m0,
-; CHECK: v_movrels_b32_e32
+; CHECK: v_cndmask_b32_e64
+; CHECK: v_cndmask_b32_e64
+; CHECK: v_cndmask_b32_e64
 ; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
 
 ; CHECK: [[BB4]]:
 ; CHECK: buffer_load_dwordx4
-; CHECK: s_mov_b32 m0,
-; CHECK: v_movrels_b32_e32
+; CHECK: v_cndmask_b32_e64
+; CHECK: v_cndmask_b32_e64
+; CHECK: v_cndmask_b32_e64
 
 ; CHECK: [[ENDBB]]:
 ; CHECK: buffer_store_dword
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -22,7 +22,7 @@
 define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %idx = add i32 %in, 1
-  %elt = extractelement <4 x float> <float 4.0, float 1.0, float 2.0, float 3.0>, i32 %idx
+  %elt = extractelement <16 x float> <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>, i32 %idx
   store float %elt, float addrspace(1)* %out
   ret void
 }
@@ -44,11 +44,11 @@
 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
-  %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
-  %elt = extractelement <4 x i32> %vec, i32 %idx
+  %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %elt = extractelement <16 x i32> %vec, i32 %idx
   store i32 %elt, i32 addrspace(1)* %out
   ret void
 }
@@ -68,7 +68,7 @@
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
-  %elt = extractelement <4 x float> <float 4.0, float 1.0, float 2.0, float 3.0>, i32 %in
+  %elt = extractelement <16 x float> <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>, i32 %in
   store float %elt, float addrspace(1)* %out
   ret void
 }
@@ -79,15 +79,15 @@
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: v_mov_b32_e32 v2, 2
-; IDXMODE: v_mov_b32_e32 v3, 3
+; IDXMODE: v_mov_b32_e32 v14, 15
+; IDXMODE: v_mov_b32_e32 v15, 16
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  %value = extractelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -102,14 +102,26 @@
 ; IDXMODE: v_mov_b32_e32 v1,
 ; IDXMODE: v_mov_b32_e32 v2,
 ; IDXMODE: v_mov_b32_e32 v3,
+; IDXMODE: v_mov_b32_e32 v4,
+; IDXMODE: v_mov_b32_e32 v5,
+; IDXMODE: v_mov_b32_e32 v6,
+; IDXMODE: v_mov_b32_e32 v7,
+; IDXMODE: v_mov_b32_e32 v8,
+; IDXMODE: v_mov_b32_e32 v9,
+; IDXMODE: v_mov_b32_e32 v10,
+; IDXMODE: v_mov_b32_e32 v11,
+; IDXMODE: v_mov_b32_e32 v12,
+; IDXMODE: v_mov_b32_e32 v13,
+; IDXMODE: v_mov_b32_e32 v14,
+; IDXMODE: v_mov_b32_e32 v15,
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %or = or <4 x i32> %vec0, %vec1
-  %value = extractelement <4 x i32> %or, i32 %index
+  %or = or <16 x i32> %vec0, %vec1
+  %value = extractelement <16 x i32> %or, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -138,7 +150,7 @@
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
-  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  %value = extractelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -364,9 +376,9 @@
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
   %idx0 = load volatile i32, i32 addrspace(1)* %gep
   %idx1 = add i32 %idx0, 1
-  %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+  %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
-  %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+  %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
   store volatile i32 %val0, i32 addrspace(1)* %out0
   store volatile i32 %val1, i32 addrspace(1)* %out0
   %cmp = icmp eq i32 %id, 0
@@ -522,7 +534,7 @@
 
 ; offset puts outside of superregister boundaries, so clamp to 1st element.
 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
-; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
 ; MOVREL: s_mov_b32 m0, [[IDX]]
 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
@@ -532,11 +544,11 @@
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
-  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
-  %offset = add i32 %idx, 3
-  %value = extractelement <4 x i32> %ld, i32 %offset
+  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
+  %offset = add i32 %idx, 15
+  %value = extractelement <16 x i32> %ld, i32 %offset
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -544,20 +556,20 @@
 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
-; MOVREL: s_add_i32 m0, [[IDX]], 4
+; MOVREL: s_add_i32 m0, [[IDX]], 16
 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 
-; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4
+; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
-  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
-  %offset = add i32 %idx, 4
-  %value = extractelement <4 x i32> %ld, i32 %offset
+  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
+  %offset = add i32 %idx, 16
+  %value = extractelement <16 x i32> %ld, i32 %offset
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -565,7 +577,7 @@
 
 ; Test that the or is folded into the base address register instead of
 ; added to m0
-; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 ; GCN-NOT: [[IDX_SHL]]
@@ -576,12 +588,12 @@
 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
 ; MOVREL: v_movrels_b32_e32
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
 entry:
-  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
-  %value = extractelement <4 x i32> %ld, i32 %idx
+  %value = extractelement <16 x i32> %ld, i32 %idx
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
Index: test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -6,10 +6,10 @@
 
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
 ; GFX-NOT: buffer_
-; GCN: v_readfirstlane_b32
-; GFX8: v_movrels_b32
-; GFX9: s_set_gpr_idx_on
-; GFX9: s_set_gpr_idx_off
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
 
 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -332,7 +332,7 @@
 
 ; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; SICIVI: s_buffer_load_dwordx2
+; GCN: s_buffer_load_dwordx2
 ; SICIVI: s_mov_b32 m0
 ; SICIVI_DAG: v_interp_p1_f32
 ; SICIVI_DAG: v_interp_p1_f32
@@ -340,13 +340,15 @@
 ; SICIVI_DAG: v_interp_p2_f32
 ; SICIVI_DAG: v_interp_p2_f32
 ; SICIVI_DAG: v_interp_p2_f32
-; SICIVI: s_mov_b32 m0
-; SICIVI: v_movrels_b32_e32
+;
+; extractelement does not result in movrels anymore for vectors fitting in 8 dwords:
+; SICIVI-NOT: s_mov_b32 m0
+; SICIVI-NOT: v_movrels_b32_e32
+; v_cndmask_b32_e32
+; v_cndmask_b32_e32
 ;
 ; Merging is still thwarted on GFX9 due to s_set_gpr_idx
 ;
-; GFX9: s_buffer_load_dword
-; GFX9: s_buffer_load_dword
 define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -51,8 +51,7 @@
 ;   t21: v2i32,ch = load t12, t10, undef:i64
 ;     t23: i64 = bitcast t21
 ;   t30: i16 = truncate t23
-; SI: buffer_load_dword v[[VAL:[0-9]+]]
-; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
+; GCN: buffer_load_dword v[[VAL:[0-9]+]]
 ; GCN: buffer_store_short v[[VAL]], off
 define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
Index: test/CodeGen/AMDGPU/vector-extract-insert.ll
===================================================================
--- test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -28,7 +28,9 @@
 
 ; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
 ; GCN: buffer_load_dwordx4
 ; GCN: v_movreld_b32
-; GCN: v_movrels_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; GCN: buffer_store_dword v
 define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()