diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1902,7 +1902,9 @@
     // (add n0, c0)
     // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
-    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
+    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
+        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
       Base = N0;
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
       return true;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -160,16 +160,14 @@
   ret void
 }
 
-; TODO: Should be able to copy to m0 only once and increment base instead.
-
 ; GCN-LABEL: {{^}}double8_extelt:
+; GCN-NOT: buffer_
+; GCN-NOT: s_or_b32
 ; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
-; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
-; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
-; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
+; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
+; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
+; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
+; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
@@ -179,13 +177,13 @@
 }
 
 ; GCN-LABEL: {{^}}double7_extelt:
+; GCN-NOT: buffer_
+; GCN-NOT: s_or_b32
 ; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
-; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
-; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
-; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
+; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
+; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
+; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
+; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -590,12 +590,11 @@
 }
 
 ; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
-; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
+; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(SRC0)
+; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
 define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
@@ -611,12 +610,11 @@
 }
 
 ; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
-; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
+; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(DST)
+; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
 define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -268,16 +268,14 @@
   ret void
 }
 
-; TODO: We should be able not to write to m0 twice and just increment base.
-
 ; GCN-LABEL: {{^}}double8_inselt:
 ; GCN-NOT: v_cndmask
 ; GCN-NOT: buffer_
-; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
-; GCN-DAG: s_mov_b32 m0, [[IND0]]
-; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
-; GCN: s_mov_b32 m0, [[IND1]]
-; GCN: v_movreld_b32_e32 [[BASE]]
+; GCN-NOT: s_or_b32
+; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
+; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: v_movreld_b32_e32 v[[#BASE+1]],
 define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
@@ -288,11 +286,11 @@
 ; GCN-LABEL: {{^}}double7_inselt:
 ; GCN-NOT: v_cndmask
 ; GCN-NOT: buffer_
-; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
-; GCN-DAG: s_mov_b32 m0, [[IND0]]
-; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
-; GCN: s_mov_b32 m0, [[IND1]]
-; GCN: v_movreld_b32_e32 [[BASE]]
+; GCN-NOT: s_or_b32
+; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
+; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: v_movreld_b32_e32 v[[#BASE+1]],
 define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1643,7 +1643,6 @@
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v0, s8
 ; SI-NEXT: s_lshl_b32 s4, s4, 1
-; SI-NEXT: s_mov_b32 m0, s4
 ; SI-NEXT: v_mov_b32_e32 v1, s9
 ; SI-NEXT: v_mov_b32_e32 v2, s10
 ; SI-NEXT: v_mov_b32_e32 v3, s11
@@ -1659,10 +1658,9 @@
 ; SI-NEXT: v_mov_b32_e32 v13, s21
 ; SI-NEXT: v_mov_b32_e32 v14, s22
 ; SI-NEXT: v_mov_b32_e32 v15, s23
-; SI-NEXT: s_or_b32 s4, s4, 1
-; SI-NEXT: v_movreld_b32_e32 v0, 0
 ; SI-NEXT: s_mov_b32 m0, s4
-; SI-NEXT: v_movreld_b32_e32 v0, v16
+; SI-NEXT: v_movreld_b32_e32 v0, 0
+; SI-NEXT: v_movreld_b32_e32 v1, v16
 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
@@ -1680,7 +1678,6 @@
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s8
 ; VI-NEXT: s_lshl_b32 s4, s4, 1
-; VI-NEXT: s_mov_b32 m0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s9
 ; VI-NEXT: v_mov_b32_e32 v2, s10
 ; VI-NEXT: v_mov_b32_e32 v3, s11
@@ -1696,10 +1693,9 @@
 ; VI-NEXT: v_mov_b32_e32 v13, s21
 ; VI-NEXT: v_mov_b32_e32 v14, s22
 ; VI-NEXT: v_mov_b32_e32 v15, s23
-; VI-NEXT: s_or_b32 s4, s4, 1
-; VI-NEXT: v_movreld_b32_e32 v0, 0
 ; VI-NEXT: s_mov_b32 m0, s4
-; VI-NEXT: v_movreld_b32_e32 v0, v16
+; VI-NEXT: v_movreld_b32_e32 v0, 0
+; VI-NEXT: v_movreld_b32_e32 v1, v16
 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16