Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -83,6 +83,7 @@
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performTruncCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -563,6 +563,7 @@
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -2757,6 +2758,30 @@
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  // trunc i64 (shl x, y) to i32 -> i32 (shl (trunc x to i32), y) | y < 32
+  if (Op.getOpcode() != ISD::SHL || Op.getValueType() != MVT::i64 ||
+      !Op.hasOneUse())
+    return SDValue();
+
+  auto RHS = Op.getOperand(1);
+  KnownBits Known;
+  DAG.computeKnownBits(RHS, Known);
+  if (Known.getBitWidth() - Known.countMinLeadingZeros() > 5)
+    return SDValue();
+
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op.getOperand(0));
+  return DAG.getNode(ISD::SHL, SDLoc(Op), MVT::i32, Lo, RHS);
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
@@ -3394,6 +3419,8 @@
 
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
Index: test/CodeGen/AMDGPU/alignbit-pat.ll
===================================================================
--- test/CodeGen/AMDGPU/alignbit-pat.ll
+++ test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -16,23 +16,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat:
-; GCN-DAG: s_load_dword s[[SHL:[0-9]+]]
-; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: s_sub_i32 s[[SHR:[0-9]+]], 32, s[[SHL]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]]
-
-define amdgpu_kernel void @alignbit_shl_pat(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 31
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_v:
 ; GCN-DAG: load_dword v[[SHR:[0-9]+]],
 ; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
@@ -53,27 +36,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_v:
-; GCN-DAG: load_dword v[[SHL:[0-9]+]],
-; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_sub_i32_e32 v[[SHR:[0-9]+]], {{[^,]+}}, 32, v[[SHL]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]
-
-define amdgpu_kernel void @alignbit_shl_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) {
-bb:
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
-  %tmp = load i64, i64 addrspace(1)* %gep1, align 8
-  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid
-  %amt = load i32, i32 addrspace(1)* %gep2, align 4
-  %tmp3 = and i32 %amt, 31
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %gep2, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:
 ; Negative test, wrong constant
 ; GCN: v_lshr_b64
@@ -90,22 +52,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and30:
-; Negative test, wrong constant
-; GCN: v_lshl_b64
-; GCN-NOT: v_alignbit_b32
-
-define amdgpu_kernel void @alignbit_shl_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 30
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:
 ; Negative test, wrong constant
 ; GCN: v_lshr_b64
@@ -122,21 +68,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and63:
-; Negative test, wrong constant
-; GCN: v_lshl_b64
-; GCN-NOT: v_alignbit_b32
-
-define amdgpu_kernel void @alignbit_shl_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 63
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 attributes #0 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -243,3 +243,62 @@
   store volatile i64 %shl, i64 addrspace(1)* %in
   ret void
 }
+
+; GCN-LABEL: {{^}}trunc_shl_and31:
+; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
+; GCN-NOT: v_lshl_b64
+; GCN-NOT: v_lshlrev_b64
+define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 31
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_and30:
+; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 30
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
+; GCN-NOT: v_lshl_b64
+; GCN-NOT: v_lshlrev_b64
+define amdgpu_kernel void @trunc_shl_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 30
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_wrong_and63:
+; Negative test, wrong constant
+; GCN: v_lshl_b64
+define amdgpu_kernel void @trunc_shl_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 63
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_no_and:
+; Negative test, shift can be full 64 bit
+; GCN: v_lshl_b64
+define amdgpu_kernel void @trunc_shl_no_and(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp4 = zext i32 %arg2 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
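
Note: for reference, a minimal IR-level sketch of what the new performTruncCombine does. The function and value names below are illustrative only and are not part of the patch, and the combine itself runs on the SelectionDAG rather than on IR; the "after" function is simply the IR-shaped equivalent of the DAG it builds.

  ; before: shift in 64 bits, then keep only the low half
  define i32 @trunc_shl_sketch_before(i64 %x, i32 %n) {
    %amt32 = and i32 %n, 31        ; shift amount provably fits in 5 bits
    %amt   = zext i32 %amt32 to i64
    %wide  = shl i64 %x, %amt
    %lo    = trunc i64 %wide to i32
    ret i32 %lo
  }

  ; after: truncate first, then shift in 32 bits
  define i32 @trunc_shl_sketch_after(i64 %x, i32 %n) {
    %amt32 = and i32 %n, 31
    %x.lo  = trunc i64 %x to i32
    %lo    = shl i32 %x.lo, %amt32
    ret i32 %lo
  }

A left shift never moves bits toward the low end, so the low 32 bits of %wide depend only on the low 32 bits of %x. The combine still has to prove the amount is at most 31 (the KnownBits check allows at most 5 possibly-set bits) because a 32-bit shift by 32 or more would be undefined, whereas the original 64-bit shift handles such amounts; that case is what the trunc_shl_wrong_and63 and trunc_shl_no_and negative tests cover. With the rewrite, trunc_shl_and31 and trunc_shl_and30 select a single v_lshlrev_b32 instead of a 64-bit shift.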