Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -51,6 +51,9 @@
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
 
+  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
@@ -155,13 +158,30 @@
   for (MVT VT : MVT::fp_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -173,9 +193,14 @@
   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
 
+  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+
+
+  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
+  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch(Op) {
       case ISD::LOAD:
@@ -186,6 +211,7 @@
       case ISD::INSERT_VECTOR_ELT:
       case ISD::INSERT_SUBVECTOR:
       case ISD::EXTRACT_SUBVECTOR:
+      case ISD::SCALAR_TO_VECTOR:
         break;
       case ISD::CONCAT_VECTORS:
         setOperationAction(Op, VT, Custom);
@@ -197,6 +223,22 @@
     }
   }
 
+  // Most operations are naturally 32-bit vector operations. We only support
+  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
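+  // The promoted operations are performed on the value bitcast to v4i32; for
+  // example, storing a <2 x i64> is now selected as a single
+  // buffer_store_dwordx4 instead of two buffer_store_dwordx2.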
+  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+  }
+
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -2504,6 +2504,11 @@
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/
 
+//def : Extract_Element;
+//def : Extract_Element;
+//def : Extract_Element;
+//def : Extract_Element;
+
 foreach Index = 0-2 in {
   def Extract_Element_v2i32_#Index : Extract_Element <
     i32, v2i32, Index, !cast(sub#Index)
@@ -2589,6 +2594,16 @@
 def : BitConvert ;
 def : BitConvert ;
 
+
+def : BitConvert ;
+def : BitConvert ;
+
+def : BitConvert ;
+def : BitConvert ;
+
+
+
+
 def : BitConvert ;
 def : BitConvert ;
 def : BitConvert ;
Index: lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.td
+++ lib/Target/AMDGPU/SIRegisterInfo.td
@@ -193,7 +193,7 @@
   (add SGPR_64, VCC, EXEC, FLAT_SCR)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)> {
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 128, (add SGPR_128)> {
   // Requires 2 s_mov_b64 to copy
   let CopyCost = 2;
 }
@@ -221,7 +221,7 @@
   let CopyCost = 3;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)> {
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 128, (add VGPR_128)> {
   // Requires 4 v_mov_b32 to copy
   let CopyCost = 4;
 }
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -61,15 +61,11 @@
   ret void
 }
 
-
-; FIXME: the v_lshl_b64 x, x, 32 is a bad way of doing a copy
-
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
-; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
 define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
Index: test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_write2.ll
+++ test/CodeGen/AMDGPU/ds_write2.ll
@@ -345,8 +345,9 @@
 
 ; SI-LABEL: @store_misaligned64_constant_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; SI: s_endpgm
 define void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
Index: test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
===================================================================
--- test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
+++ test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
@@ -17,3 +17,27 @@
   store volatile i64 %val, i64 addrspace(1)* %in
   ret void
 }
+
+
+define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind {
+  %p0 = extractelement <2 x i64> %foo, i32 0
+  %p1 = extractelement <2 x i64> %foo, i32 1
+  %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
+  store volatile i64 %p1, i64 addrspace(1)* %out
+  store volatile i64 %p0, i64 addrspace(1)* %out1
+  ret void
+}
+
+define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind {
+  %dynelt = extractelement <2 x i64> %foo, i32 %elt
+  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind {
+  %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
+  %or = or <2 x i64> %load, %arst
+  %dynelt = extractelement <2 x i64> %or, i32 %elt
+  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/global-extload-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/global-extload-i32.ll
+++ test/CodeGen/AMDGPU/global-extload-i32.ll
@@ -49,8 +49,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64:
 ; SI: buffer_load_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
@@ -63,8 +62,7 @@
 ; SI: buffer_load_dwordx2
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
@@ -75,10 +73,8 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64:
 ; SI: buffer_load_dwordx4
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -93,10 +89,8 @@
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -108,14 +102,10 @@
 ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
 ; SI: buffer_load_dwordx4
 ; SI: buffer_load_dwordx4
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -136,15 +126,10 @@
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -163,29 +148,25 @@
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
@@ -200,23 +181,14 @@
 ; SI: buffer_load_dwordx4
 ; SI: buffer_load_dwordx4
 
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
@@ -269,41 +241,25 @@
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
 
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 
 ; SI: s_endpgm
 define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
@@ -323,41 +279,25 @@
 ; SI: buffer_load_dwordx4
 ; SI: buffer_load_dwordx4
 
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 
 ; SI: s_endpgm
 define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -382,10 +382,9 @@
 ; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
 ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
-; GCN-DAG: v_cvt_f64_f32_e32 [[CVT2:v\[[0-9]+:[0-9]+\]]], v[[CVT0]]
-; GCN-DAG: v_cvt_f64_f32_e32 [[CVT3:v\[[0-9]+:[0-9]+\]]], v[[CVT1]]
-; GCN-DAG: buffer_store_dwordx2 [[CVT2]]
-; GCN-DAG: buffer_store_dwordx2 [[CVT3]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
 ; GCN: s_endpgm
 define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
@@ -395,6 +394,25 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
+
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32
+; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]]
+; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
+
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16_e32
+
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN-NOT: v_cvt_f64_f32_e32
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN: s_endpgm
 define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
   %cvt = fpext <3 x half> %val to <3 x double>
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -70,8 +70,9 @@
 }
 
 ; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
@@ -79,10 +80,11 @@
 }
 
 ; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
@@ -202,10 +204,28 @@
 }
 
 ; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
+; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
+; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
+
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+
+; SI: s_mov_b32 m0, [[SCALEDIDX]]
+; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+
+; Increment to next element.
+; FIXME: Should be able to manipulate m0 directly instead of add and
+; copy.
+
+; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
+; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
+; SI-DAG: s_mov_b32 m0, [[IDX1]]
+; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
@@ -213,9 +233,16 @@
   ret void
 }
 
+; FIXME: Inline immediate should be folded into v_movreld_b32.
 ; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+
+; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
+; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
+
+; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
@@ -223,12 +250,27 @@
   ret void
 }
 
+; FIXME: Should be able to do without stack access
 ; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: SCRATCH_RSRC_DWORD
+
+; Stack store
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+
+; Write element
+; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+
+; Stack reload
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+
+; Store result
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
+; SI: ScratchSize: 32
+
 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
@@ -236,15 +278,26 @@
 }
 
 ; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: SCRATCH_RSRC_DWORD
+
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
+; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
+
+; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
+; SI: ScratchSize: 64
 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
Index: test/CodeGen/AMDGPU/merge-stores.ll
===================================================================
--- test/CodeGen/AMDGPU/merge-stores.ll
+++ test/CodeGen/AMDGPU/merge-stores.ll
@@ -191,9 +191,7 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
-; XGCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx4
 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
@@ -203,13 +201,8 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
-; XGCN: buffer_store_dwordx4
-; XGCN: buffer_store_dwordx4
-
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
Index: test/CodeGen/AMDGPU/reorder-stores.ll
===================================================================
--- test/CodeGen/AMDGPU/reorder-stores.ll
+++ test/CodeGen/AMDGPU/reorder-stores.ll
@@ -2,14 +2,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
-; SI: buffer_load_dwordx2
-; SI: buffer_load_dwordx2
-; SI: buffer_load_dwordx2
-; SI: buffer_load_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16