Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -5116,6 +5116,14 @@ Denominator, Numerator); } case Intrinsic::amdgcn_icmp: { + // There is a Pat that handles icmp(i1 x, 0, ne), so return it as-is. + // Operands 2 and 3 may be non-constant, so test them without casting; + // everything else falls through to lowerICMPIntrinsic. + const auto *CC = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (Op.getOperand(1).getValueType() == MVT::i1 && + isNullConstant(Op.getOperand(2)) && CC && + CC->getZExtValue() == ICmpInst::Predicate::ICMP_NE) + return Op; return lowerICMPIntrinsic(*this, Op.getNode(), DAG); } case Intrinsic::amdgcn_fcmp: { Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -583,6 +583,11 @@ // TODO: we could add more variants for other types of conditionals +def : Pat < + (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)), + (COPY $src) // Return the SGPRs representing i1 src +>; + //===----------------------------------------------------------------------===// // VOP1 Patterns //===----------------------------------------------------------------------===// Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3709,6 +3709,11 @@ // Promote to next legal integer type. unsigned Width = CmpType->getBitWidth(); unsigned NewWidth = Width; + + // Don't do anything for i1 comparisons. 
+ if (Width == 1) + break; + if (Width <= 16) NewWidth = 16; else if (Width <= 32) Index: test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -4,6 +4,7 @@ declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0 declare i64 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0 +declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) #0 ; No crash on invalid input ; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc: @@ -314,4 +315,21 @@ ret void } +; GCN-LABEL: {{^}}v_icmp_i1_ne0: +; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]], +; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]], +; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]] +; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 +; GCN-NEXT: v_mov_b32_e32 +; GCN-NEXT: v_mov_b32_e32 +; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2 +define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) { + %c0 = icmp ugt i32 %a, 1 + %c1 = icmp ugt i32 %b, 2 + %src = and i1 %c0, %c1 + %result = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33) + store i64 %result, i64 addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone convergent } Index: test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll =================================================================== --- test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1406,6 +1406,7 @@ declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) nounwind readnone convergent declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) nounwind readnone convergent +declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) nounwind readnone convergent ; Make sure there's no crash for invalid input ; CHECK-LABEL: @invalid_nonconstant_icmp_code( @@ -1815,6 +1816,198 @@ ret i64 %mask } +; 1-bit NE comparisons against false: every CHECK below expects the call to stay llvm.amdgcn.icmp.i1(%cmp, false, ne), i.e. no promotion to a wider integer type + +; CHECK-LABEL: 
@fold_icmp_i1_ne_0_icmp_eq_i1( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i1(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ne_i1( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ne_i1(i32 %a, i32 %b) { + %cmp = icmp ne i32 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_sle_i1( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_sle_i1(i32 %a, i32 %b) { + %cmp = icmp sle i32 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ugt_i64( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64(i64 %a, i64 %b) { + %cmp = icmp ugt i64 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_swap_i64( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64(i64 %a, i64 %b) { + %cmp = icmp ugt i64 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 false, i1 %cmp, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f32( +; CHECK-NEXT: fcmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32(float %a, float %b) { + %cmp = fcmp oeq float %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_une_f32( +; CHECK-NEXT: fcmp +; CHECK-NEXT: call 
i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_fcmp_une_f32(float %a, float %b) { + %cmp = fcmp une float %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_olt_f64( +; CHECK-NEXT: fcmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64(double %a, double %b) { + %cmp = fcmp olt double %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i4( +; CHECK-NEXT: icmp +; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i4(i4 %a, i4 %b) { + %cmp = icmp eq i4 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i8( +; CHECK-NEXT: icmp +; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i8(i8 %a, i8 %b) { + %cmp = icmp eq i8 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i16( +; CHECK-NEXT: icmp +; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i16(i16 %a, i16 %b) { + %cmp = icmp eq i16 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i36( +; CHECK-NEXT: icmp +; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i36(i36 %a, i36 %b) { + %cmp = icmp eq i36 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i128( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_eq_i128(i128 %a, 
i128 %b) { + %cmp = icmp eq i128 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f16( +; CHECK-NEXT: fcmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16(half %a, half %b) { + %cmp = fcmp oeq half %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f128( +; CHECK-NEXT: fcmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128(fp128 %a, fp128 %b) { +; + %cmp = fcmp oeq fp128 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i4( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_slt_i4(i4 %a, i4 %b) { + %cmp = icmp slt i4 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i8( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_slt_i8(i8 %a, i8 %b) { + %cmp = icmp slt i8 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i16( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_slt_i16(i16 %a, i16 %b) { + %cmp = icmp slt i16 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i4( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ult_i4(i4 %a, i4 %b) { + %cmp = icmp ult i4 %a, %b + %mask = call i64 
@llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i8( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ult_i8(i8 %a, i8 %b) { + %cmp = icmp ult i8 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i16( +; CHECK-NEXT: icmp +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) +define i64 @fold_icmp_i1_ne_0_icmp_ult_i16(i16 %a, i16 %b) { + %cmp = icmp ult i16 %a, %b + %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33) + ret i64 %mask +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.fcmp ; --------------------------------------------------------------------