Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -399,6 +399,8 @@
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
 
+  setTargetDAGCombine(ISD::BITCAST);
+
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
@@ -2547,6 +2549,38 @@
   switch(N->getOpcode()) {
   default:
     break;
+  case ISD::BITCAST: {
+    EVT DestVT = N->getValueType(0);
+    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+      break;
+
+    // Fold bitcasts of constants.
+    //
+    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+    // TODO: Generalize and move to DAGCombiner
+    SDValue Src = N->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+      assert(Src.getValueType() == MVT::i64);
+      SDLoc SL(N);
+      uint64_t CVal = C->getZExtValue();
+      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+    }
+
+    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+      const APInt &Val = C->getValueAPF().bitcastToAPInt();
+      SDLoc SL(N);
+      uint64_t CVal = Val.getZExtValue();
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+
+      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+    }
+
+    break;
+  }
   case ISD::SHL: {
     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
       break;
Index: test/CodeGen/AMDGPU/fceil64.ll
===================================================================
--- test/CodeGen/AMDGPU/fceil64.ll
+++ test/CodeGen/AMDGPU/fceil64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
@@ -25,7 +25,7 @@
 ; SI-DAG: cndmask_b32
 ; SI-DAG: v_cmp_lt_f64
 ; SI-DAG: v_cmp_lg_f64
-; SI: s_and_b64
+; SI-DAG: s_and_b64
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 ; SI: v_add_f64
Index: test/CodeGen/AMDGPU/sdivrem64.ll
===================================================================
--- test/CodeGen/AMDGPU/sdivrem64.ll
+++ test/CodeGen/AMDGPU/sdivrem64.ll
@@ -1,8 +1,8 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-;FUNC-LABEL: {{^}}test_sdiv:
+;FUNC-LABEL: {{^}}s_test_sdiv:
 ;EG: RECIP_UINT
 ;EG: LSHL {{.*}}, 1,
 ;EG: BFE_UINT
@@ -36,47 +36,47 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN-NOT: v_mad_f32
-;SI-NOT: v_lshr_b64
-;VI-NOT: v_lshrrev_b64
-;GCN: s_endpgm
-define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN-NOT: v_mad_f32
+; SI-NOT: v_lshr_b64
+; VI-NOT: v_lshrrev_b64
+; GCN: s_endpgm
+define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = sdiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_srem:
+;FUNC-LABEL: {{^}}s_test_srem:
 ;EG: RECIP_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
@@ -144,7 +144,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
Index: test/CodeGen/AMDGPU/sint_to_fp.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
@@ -10,14 +10,14 @@
   ret void
 }
 
-; FIXME: select on 0, 0
 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 vcc,
 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
 ; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
Index: test/CodeGen/AMDGPU/uint_to_fp.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -70,14 +70,13 @@
   ret void
 }
 
-; FIXME: select on 0, 0
 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; SI: v_cmp_eq_i32_e64 vcc
 ; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
 ; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
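
Note on the ISD::BITCAST combine above: it rewrites (v2i32 (bitcast i64:k)) into (build_vector lo_32(k), hi_32(k)), so a 64-bit constant materializes as two independent 32-bit constants. Below is a minimal standalone sketch of that split, assuming only standard C++; the helper names lo32/hi32 are hypothetical stand-ins mirroring llvm::Lo_32 / llvm::Hi_32 from llvm/Support/MathExtras.h, and this is not the DAG combine code itself.

  #include <cinttypes>
  #include <cstdint>
  #include <cstdio>

  // Hypothetical stand-ins for llvm::Lo_32 / llvm::Hi_32: extract the
  // low and high 32-bit halves of a 64-bit value.
  static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
  static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

  int main() {
    // Bit pattern of the double constant 1.0, as the ConstantFPSDNode
    // path would observe it after bitcastToAPInt().
    uint64_t K = UINT64_C(0x3FF0000000000000);
    // The combine emits: build_vector lo_32(K), hi_32(K)
    printf("lo = 0x%08" PRIx32 ", hi = 0x%08" PRIx32 "\n", lo32(K), hi32(K));
    // Prints: lo = 0x00000000, hi = 0x3ff00000
    return 0;
  }

This split is also why the i1-to-f64 tests above can now check for a plain v_mov_b32 of 0: for both f64 constants in the select (0.0 and 1.0), the low 32 bits are zero, so only the high half still needs a v_cndmask and the low half becomes a constant zero.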