Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2595,27 +2595,49 @@
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
-
-  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
-  // common case, splitting this into a move and a 32-bit shift is faster and
-  // the same code size.
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
     return SDValue();
 
-  unsigned RHSVal = RHS->getZExtValue();
-  if (RHSVal < 32)
-    return SDValue();
-
   SDValue LHS = N->getOperand(0);
+  unsigned RHSVal = RHS->getZExtValue();
+  if (!RHSVal)
+    return LHS;
 
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  switch (LHS->getOpcode()) {
+  default:
+    break;
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND: {
+    // shl (ext x) => zext (shl x), if shift does not overflow int
+    KnownBits Known;
+    SDValue X = LHS->getOperand(0);
+    DAG.computeKnownBits(X, Known);
+    unsigned LZ = Known.countMinLeadingZeros();
+    if (LZ < RHSVal)
+      break;
+    EVT XVT = X.getValueType();
+    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+    return DAG.getZExtOrTrunc(Shl, SL, VT);
+  }
+  }
+
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  if (RHSVal < 32)
+    return SDValue();
+
   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
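The guard in the new switch is the heart of the change: the 64-bit shift of an extended value may be narrowed only when the shift cannot push a set bit across the 32-bit boundary. A minimal standalone sketch of that test, using plain integers in place of SelectionDAG nodes (the helper name narrowableTo32BitShl is hypothetical, and __builtin_clz is the GCC/Clang builtin, not LLVM API):

  #include <cstdint>

  // Mirrors "if (LZ < RHSVal) break;" above: narrowing is sound only when
  // the known leading zeros of the pre-extension value cover the whole
  // shift amount, so no set bit can reach the high 32 bits.
  static bool narrowableTo32BitShl(uint32_t knownValueMask, unsigned shiftAmt) {
    unsigned leadingZeros =
        knownValueMask == 0 ? 32u : unsigned(__builtin_clz(knownValueMask));
    return leadingZeros >= shiftAmt;
  }

Note that the combine emits getZExtOrTrunc even for SIGN_EXTEND sources; that is sound because a nonzero shift amount with LZ >= RHSVal forces the source's sign bit to be known zero, so sign- and zero-extension agree.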
Index: llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
@@ -84,11 +84,10 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
Index: llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -202,10 +202,10 @@
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: v_add_u16_e32
-; VI: v_add_u16_e32
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_add_u16_e32
+; VI-DAG: v_add_u16_e32
 
 ; VI: buffer_store_dwordx4
 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
Index: llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -50,7 +50,7 @@
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -128,7 +128,7 @@
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
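The tightened bfe-patterns checks pin the v_bfe operand to the [[WIDTH]] register that was actually loaded, instead of any s-register. For reference, a simplified standalone C++ model of the unsigned variant's semantics as assumed by these tests (my own sketch, not LLVM code; hardware operand handling is reduced to the low 5 bits of offset and width):

  #include <cstdint>

  // Simplified model of v_bfe_u32 dst, src, offset, width: extract `width`
  // bits of `src` starting at bit `offset`, zero-extended. With offset 0
  // this is just a mask, which is why the sub+shift pairs in these tests
  // fold into a single BFE.
  static uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
    offset &= 31;
    width &= 31;                 // width == 32 is not representable here
    if (width == 0)
      return 0;
    return (src >> offset) & ((1u << width) - 1u);
  }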
Index: llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
@@ -135,7 +135,6 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i64:
-; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
@@ -145,7 +144,7 @@
 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
-; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
 define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
Index: llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -121,8 +121,7 @@
 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
-; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
 define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
@@ -266,8 +266,8 @@
 }
 
 ; SI-LABEL: @simple_write2_one_val_f64
-; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
-; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
 define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
@@ -845,10 +845,10 @@
 ; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
-; GCN: v_min_f32
-; GCN: v_max_f32
-; GCN: v_min_f32
-; GCN: v_max_f32
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
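In the fmed3 mismatch case above, the source-modifier mismatch prevents fusing into a single v_med3_f32, so the raw min/max expansion from the IR survives; the four relaxed checks correspond exactly to its two mins and two maxes. A reference expression for that expansion (my own illustration, assuming no NaNs, consistent with the test's nnan setup):

  #include <algorithm>

  // Median of three without a dedicated med3 instruction:
  // two mins and two maxes, matching the four checked VALU ops.
  static float med3_ref(float a, float b, float c) {
    return std::max(std::min(a, b), std::min(std::max(a, b), c));
  }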
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -356,6 +356,7 @@
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
 ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -371,6 +372,7 @@
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
 ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -207,6 +207,7 @@
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
 ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -222,6 +223,7 @@
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
 ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/lshl64-to-32.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}zext_shl64_to_32:
+; CHECK: s_lshl_b32
+; CHECK-NOT: s_lshl_b64
+define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+  %and = and i32 %x, 1073741823
+  %ext = zext i32 %and to i64
+  %shl = shl i64 %ext, 2
+  store i64 %shl, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}sext_shl64_to_32:
+; CHECK: s_lshl_b32
+; CHECK-NOT: s_lshl_b64
+define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+  %and = and i32 %x, 536870911
+  %ext = sext i32 %and to i64
+  %shl = shl i64 %ext, 2
+  store i64 %shl, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}zext_shl64_overflow:
+; CHECK: s_lshl_b64
+; CHECK-NOT: s_lshl_b32
+define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+  %and = and i32 %x, 2147483647
+  %ext = zext i32 %and to i64
+  %shl = shl i64 %ext, 2
+  store i64 %shl, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}sext_shl64_overflow:
+; CHECK: s_lshl_b64
+; CHECK-NOT: s_lshl_b32
+define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+  %and = and i32 %x, 2147483647
+  %ext = sext i32 %and to i64
+  %shl = shl i64 %ext, 2
+  store i64 %shl, i64 addrspace(1)* %out, align 4
+  ret void
+}
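The mask constants in the new lshl64-to-32.ll test sit on either side of the combine's overflow check. A quick way to verify why the first two functions narrow and the last two do not, reusing the leading-zero reasoning from the sketch near the top (hedged: a standalone illustration relying on GCC/Clang's constexpr-usable __builtin_clz, not part of the test):

  // 1073741823 = 0x3FFFFFFF -> 2 known leading zeros >= shift of 2: narrows.
  // 536870911  = 0x1FFFFFFF -> 3 known leading zeros >= shift of 2: narrows
  //                            even for sext, since bit 31 is known zero.
  // 2147483647 = 0x7FFFFFFF -> 1 known leading zero  <  shift of 2: bit 30
  //                            can shift into bit 32, so the i64 shl stays.
  static_assert(__builtin_clz(0x3FFFFFFFu) == 2, "zext_shl64_to_32 narrows");
  static_assert(__builtin_clz(0x1FFFFFFFu) == 3, "sext_shl64_to_32 narrows");
  static_assert(__builtin_clz(0x7FFFFFFFu) == 1, "the overflow cases do not");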
Index: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -299,10 +299,10 @@
 }
 
 ; GCN-LABEL: {{^}}and_not_mask_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
-; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
 ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
-; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]]
+; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
 ; GCN-NOT: v[[SHRLO]]
 ; GCN-NOT: v[[SHRHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
@@ -360,10 +360,9 @@
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
 ; GCN: buffer_store_dword v[[ZERO]]
 define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/srl.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/srl.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/srl.ll
@@ -201,7 +201,8 @@
 
 ; GCN-LABEL: {{^}}v_lshr_32_i64:
 ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI1:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], v[[VHI1]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
 define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
Index: llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
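Referring back to the v_uextract_bit_33_36_use_upper_half_shift_i64 hunk above: bits [33,36) of an i64 live entirely in the high dword, which is why the checks only load the offset:4 word and expect a 32-bit BFE with offset 1, width 3, with a constant-zero high half for the 64-bit store. A standalone model of that reduction (illustrative only, not the codegen itself):

  #include <cstdint>

  // Extract i64 bits [33,36): bit 33 of the full value is bit 1 of the
  // upper 32-bit half, so a 32-bit extract of the high dword suffices and
  // the result's own high half is always zero.
  static uint64_t extract_bits_33_36(uint64_t v) {
    uint32_t hi = uint32_t(v >> 32);
    return (hi >> 1) & 0x7u; // == bfe_u32(hi, /*offset=*/1, /*width=*/3)
  }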