Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14659,20 +14659,26 @@ if (DAG.getDataLayout().isBigEndian()) return SDValue(); - // TODO: The one-use check is overly conservative. Check the cost of the - // extract instead or remove that condition entirely. auto *Ld = dyn_cast(Extract->getOperand(0)); auto *ExtIdx = dyn_cast(Extract->getOperand(1)); - if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() || - !ExtIdx) + if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned ExtIdxValue = ExtIdx->getZExtValue(); + EVT VT = Extract->getValueType(0); + + // TODO: It would be better to ask if the extract is free, rather than cheap, + // or just eliminate this check entirely. Using a narrower load directly + // reduces the dependency chain and may reduce register pressure. + if (!Ld->hasOneUse() && TLI.isExtractSubvectorCheap(VT, ExtIdxValue)) return SDValue(); // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). - EVT VT = Extract->getValueType(0); SDLoc DL(Extract); SDValue BaseAddr = Ld->getOperand(1); - unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + unsigned Offset = ExtIdxValue * VT.getScalarType().getStoreSize(); // TODO: Use "BaseIndexOffset" to make this more effective. SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); Index: test/CodeGen/AArch64/arm64-vabs.ll =================================================================== --- test/CodeGen/AArch64/arm64-vabs.ll +++ test/CodeGen/AArch64/arm64-vabs.ll @@ -138,7 +138,7 @@ define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) { ; CHECK-LABEL: uabdl8h_rdx -; CHECK: uabdl2.8h +; CHECK: uabdl.8h ; CHECK: uabdl.8h %aload = load <16 x i8>, <16 x i8>* %a, align 1 %bload = load <16 x i8>, <16 x i8>* %b, align 1 @@ -156,7 +156,7 @@ define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) { ; CHECK-LABEL: uabdl4s_rdx -; CHECK: uabdl2.4s +; CHECK: uabdl.4s ; CHECK: uabdl.4s %aload = load <8 x i16>, <8 x i16>* %a, align 1 %bload = load <8 x i16>, <8 x i16>* %b, align 1 @@ -174,7 +174,7 @@ define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { ; CHECK: uabdl2d_rdx -; CHECK: uabdl2.2d +; CHECK: uabdl.2d ; CHECK: uabdl.2d %aload = load <4 x i32>, <4 x i32>* %a, align 1 %bload = load <4 x i32>, <4 x i32>* %b, align 1 Index: test/CodeGen/AArch64/merge-store.ll =================================================================== --- test/CodeGen/AArch64/merge-store.ll +++ test/CodeGen/AArch64/merge-store.ll @@ -4,7 +4,7 @@ @g0 = external global <3 x float>, align 16 @g1 = external global <3 x float>, align 4 -; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0 +; CHECK: ldr d[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0 ; CHECK: str d[[R0]] define void @blam() { Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -502,7 +502,8 @@ } ; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: -; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 ; GCN-DAG: v_cvt_f16_f32_e32 ; SI-DAG: v_cvt_f16_f32_e32 ; SI-DAG: v_cvt_f16_f32_e32 @@ -519,8 +520,10 @@ } ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 @@ -547,10 +550,14 @@ } ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 Index: test/CodeGen/AMDGPU/load-constant-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i32.ll +++ test/CodeGen/AMDGPU/load-constant-i32.ll @@ -28,7 +28,8 @@ ; FUNC-LABEL: {{^}}constant_load_v3i32: ; GCN: s_load_dwordx4 -; EG: VTX_READ_128 +; EG: VTX_READ_64 +; EG: VTX_READ_32 define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in @@ -151,7 +152,8 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v4i32_to_v4i64: -; GCN: s_load_dwordx4 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN: store_dwordx4 ; GCN: store_dwordx4 @@ -163,7 +165,8 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v4i32_to_v4i64: -; GCN: s_load_dwordx4 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN: s_ashr_i32 ; GCN: s_ashr_i32 @@ -180,7 +183,10 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v8i32_to_v8i64: -; GCN: s_load_dwordx8 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 @@ -199,7 +205,10 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v8i32_to_v8i64: -; GCN: s_load_dwordx8 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN: s_ashr_i32 ; GCN: s_ashr_i32 @@ -227,7 +236,14 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v16i32_to_v16i64: -; GCN: s_load_dwordx16 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-DAG: s_ashr_i32 @@ -248,7 +264,14 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v16i32_to_v16i64 -; GCN: s_load_dwordx16 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 @@ -265,8 +288,6 @@ ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = zext <16 x i32> %ld to <16 x i64> @@ -276,8 +297,22 @@ ; FUNC-LABEL: {{^}}constant_sextload_v32i32_to_v32i64: -; GCN: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 @@ -327,8 +362,22 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v32i32_to_v32i64: -; GCN: s_load_dwordx16 -; GCN: s_load_dwordx16 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 Index: test/CodeGen/AMDGPU/load-global-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f32.ll +++ test/CodeGen/AMDGPU/load-global-f32.ll @@ -33,7 +33,8 @@ ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; R600: VTX_READ_128 +; R600: VTX_READ_64 +; R600: VTX_READ_32 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in Index: test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i32.ll +++ test/CodeGen/AMDGPU/load-global-i32.ll @@ -32,7 +32,8 @@ ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 +; EG: VTX_READ_64 +; EG: VTX_READ_32 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in @@ -180,11 +181,13 @@ } ; FUNC-LABEL: {{^}}global_zextload_v4i32_to_v4i64: -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { @@ -195,8 +198,10 @@ } ; FUNC-LABEL: {{^}}global_sextload_v4i32_to_v4i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-DAG: v_ashrrev_i32 ; GCN-DAG: v_ashrrev_i32 @@ -216,11 +221,15 @@ } ; FUNC-LABEL: {{^}}global_zextload_v8i32_to_v8i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 @@ -239,11 +248,15 @@ } ; FUNC-LABEL: {{^}}global_sextload_v8i32_to_v8i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-DAG: v_ashrrev_i32 ; GCN-DAG: v_ashrrev_i32 @@ -271,15 +284,23 @@ } ; FUNC-LABEL: {{^}}global_sextload_v16i32_to_v16i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-DAG: v_ashrrev_i32 @@ -317,15 +338,23 @@ } ; FUNC-LABEL: {{^}}global_zextload_v16i32_to_v16i64 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 @@ -353,23 +382,39 @@ ; FUNC-LABEL: {{^}}global_sextload_v32i32_to_v32i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA-DAG: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA-DAG: buffer_load_dwordx2 +; GCN-NOHSA-DAG: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-DAG: v_ashrrev_i32 ; GCN-DAG: v_ashrrev_i32 @@ -424,20 +469,20 @@ ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 @@ -452,23 +497,39 @@ } ; FUNC-LABEL: {{^}}global_zextload_v32i32_to_v32i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 +; GCN-HSA: flat_load_dwordx2 ; GCN-NOHSA-DAG: buffer_store_dwordx4 Index: test/CodeGen/AMDGPU/sad.ll =================================================================== --- test/CodeGen/AMDGPU/sad.ll +++ test/CodeGen/AMDGPU/sad.ll @@ -168,11 +168,13 @@ ret void } +; FIXME: This should lower to sad? + ; GCN-LABEL: {{^}}v_sad_u32_vector_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cmp_gt_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cmp_gt_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cmp_gt_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b Index: test/CodeGen/AMDGPU/vectorize-global-local.ll =================================================================== --- test/CodeGen/AMDGPU/vectorize-global-local.ll +++ test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -1,8 +1,12 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; CHECK-DAG: flat_load_dwordx4 -; CHECK-DAG: flat_load_dwordx4 -; CHECK-DAG: flat_load_dwordx4 -; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 +; CHECK-DAG: flat_load_dwordx2 ; CHECK-DAG: ds_write2_b32 ; CHECK-DAG: ds_write2_b32 ; CHECK-DAG: ds_write2_b32 Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -3648,13 +3648,13 @@ ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512F-NEXT: vmovq %xmm2, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] @@ -3679,13 +3679,13 @@ ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512VL-NEXT: vmovq %xmm2, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] @@ -4579,13 +4579,13 @@ ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512F-NEXT: vmovq %xmm2, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] @@ -4610,13 +4610,13 @@ ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512VL-NEXT: vmovq %xmm2, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]