Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1454,9 +1454,13 @@ ConstantSDNode *C1 = cast(N1); // (add n0, c0) - Base = N0; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); - return true; + // Don't peel off the offset (c0) if doing so could possibly lead + // the base (n0) to be negative. + if (CurDAG->SignBitIsZero(N0) || C1->getSExtValue() <= 0) { + Base = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); + return true; + } } if (isa(Index)) Index: test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -14,8 +14,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] +; GCN-DAG: v_add_u32_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] + ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc @@ -36,8 +37,8 @@ ; GCN: s_mov_b64 [[MASK]], exec ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] ; GCN: s_and_saveexec_b64 vcc, vcc ; MOVREL: s_mov_b32 m0, [[READLANE]] Index: test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -17,8 +17,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] +; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] + ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc @@ -39,8 +40,8 @@ ; GCN: s_mov_b64 [[MASK]], exec ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] ; GCN: s_and_saveexec_b64 vcc, vcc ; MOVREL: s_mov_b32 m0, [[READLANE]] Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -7,11 +7,12 @@ ; indexing of vectors. ; GCN-LABEL: {{^}}extract_w_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] +; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 +; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 ; MOVREL-DAG: s_mov_b32 m0, [[IN]] ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] @@ -29,16 +30,17 @@ ; XXX: Could do v_or_b32 directly ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector: -; MOVREL: s_mov_b32 m0 ; GCN-DAG: s_or_b32 ; GCN-DAG: s_or_b32 ; GCN-DAG: s_or_b32 ; GCN-DAG: s_or_b32 +; MOVREL: s_mov_b32 m0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} + ; MOVREL: v_movrels_b32_e32 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}} @@ -176,7 +178,8 @@ } ; GCN-LABEL: {{^}}insert_w_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] +; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] +; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 ; MOVREL-DAG: s_mov_b32 m0, [[IN]] ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0 @@ -185,7 +188,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000 -; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] +; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: @@ -195,6 +198,51 @@ ret void } +; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset: +; GCN-DAG: s_load_dword [[IN:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 +; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff + +; MOVREL: s_mov_b32 m0, [[BASE]] +; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}} + +; IDXMODE: s_set_gpr_idx_on [[BASE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}} +; IDXMODE-NEXT: s_set_gpr_idx_off +define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) { +entry: + %base = zext i16 %in to i32 + %add = add i32 %base, 1 + %ins = insertelement <16 x float> , float 17.0, i32 %add + store <16 x float> %ins, <16 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}insert_signed_base_plus_offset: +; GCN-DAG: s_load_dword [[IN:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 + +; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]] +; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1 + +; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]] +; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}} + +; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], dst +; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}} +; IDXMODE-NEXT: s_set_gpr_idx_off +define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) { +entry: + %base = sext i16 %in to i32 + %add = add i32 %base, 1 + %ins = insertelement <16 x float> , float 17.0, i32 %add + store <16 x float> %ins, <16 x float> addrspace(1)* %out + ret void +} + + ; GCN-LABEL: {{^}}insert_wo_offset: ; GCN: s_load_dword [[IN:s[0-9]+]] @@ -354,8 +402,12 @@ ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec +; GCN: s_waitcnt vmcnt(0) +; PREGFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] +; GFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] + + ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc @@ -373,20 +425,20 @@ ; FIXME: Redundant copy ; GCN: s_mov_b64 exec, [[MASK]] -; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]] +; GCN: v_mov_b32_e32 [[VEC_ELT0_2:v[0-9]+]], [[S_ELT0]] ; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] ; GCN: s_and_saveexec_b64 vcc, vcc ; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]] +; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] ; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0 -; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]] +; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] ; IDXMODE: s_set_gpr_idx_off ; GCN-NEXT: s_xor_b64 exec, exec, vcc @@ -492,13 +544,15 @@ ; offset puts outside of superegister bounaries, so clamp to 1st element. ; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}} -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] +; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] +; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]] +; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15 + ; MOVREL: s_mov_b32 m0, [[IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] +; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; IDXMODE: s_set_gpr_idx_on [[IDX]], src0 -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] +; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] @@ -514,10 +568,11 @@ ; GCN-LABEL: {{^}}extract_out_of_bounds_offset: ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] -; MOVREL: s_add_i32 m0, [[IDX]], 16 +; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 + +; MOVREL: s_mov_b32 m0, [[ADD_IDX]] ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; IDXMODE: s_set_gpr_idx_off @@ -532,18 +587,15 @@ ret void } -; Test that the or is folded into the base address register instead of -; added to m0 - ; GCN-LABEL: {{^}}extractelement_v16i32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] -; GCN-NOT: [[IDX_SHL]] +; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1 -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] +; MOVREL: s_mov_b32 m0, [[IDX_FIN]] ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 +; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) { @@ -559,12 +611,12 @@ ; GCN-LABEL: {{^}}insertelement_v16f32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] -; GCN-NOT: [[IDX_SHL]] +; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1 -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] +; MOVREL: s_mov_b32 m0, [[IDX_FIN]] ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst +; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {