Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -931,9 +931,10 @@
   unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::V_ADD_I32_e64
                                               : AMDGPU::V_SUB_I32_e64;

-  CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
-                       {N->getOperand(0), N->getOperand(1),
-                        CurDAG->getConstant(0, {}, MVT::i1)/*clamp bit*/});
+  CurDAG->SelectNodeTo(
+      N, Opc, N->getVTList(),
+      {N->getOperand(0), N->getOperand(1),
+       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
 }

 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
Index: test/CodeGen/AMDGPU/sremi64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sremi64.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX678 %s
+; RUN: llc -march=amdgcn -mcpu=gfx704 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX678 %s
+; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX678 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: wrapper:
+; GFX678: v_add_{{[iu]}}32_e32
+; GFX9: v_add_co_u32_e32
+; GFX678: v_addc_u32_e32
+; GFX9: v_addc_co_u32_e32
+
+; This was lowering the i64 srem with V_ADD_I32_e64 etc. ops without the required immediate clamp bit operand.
+
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg, i1 immarg)
+
+define amdgpu_gs void @wrapper(i32 inreg %arg4) {
+main_body:
+  %tmp = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 4, i32 0)
+  %tmp14 = bitcast float %tmp to i32
+  %tmp15 = insertelement <2 x i32> undef, i32 %tmp14, i32 1
+  %tmp16 = bitcast <2 x i32> %tmp15 to i64
+  %tmp17 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 32, i32 0)
+  %tmp18 = bitcast float %tmp17 to i32
+  %tmp19 = insertelement <2 x i32> undef, i32 %tmp18, i32 0
+  %tmp20 = insertelement <2 x i32> %tmp19, i32 undef, i32 1
+  %tmp21 = bitcast <2 x i32> %tmp20 to i64
+  %tmp22 = srem i64 %tmp16, %tmp21
+  %tmp23 = icmp eq i64 %tmp22, 0
+  %tmp24 = icmp eq i64 0, 0
+  %tmp25 = icmp eq i64 0, 0
+  %tmp26 = and i1 %tmp24, %tmp25
+  %tmp27 = and i1 %tmp23, %tmp26
+  %tmp28 = select i1 %tmp27, i32 0, i32 1065353216
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 64, i32 %arg4, i32 0, i32 4, i32 4, i1 true, i1 true)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp28, <4 x i32> undef, i32 0, i32 56, i32 %arg4, i32 0, i32 4, i32 4, i1 true, i1 true)
+  ret void
+}
+