Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -638,14 +638,6 @@
   /// gen prepare.
   virtual bool preferZeroCompareBranch() const { return false; }
 
-  /// Return true if it is safe to transform an integer-domain bitwise operation
-  /// into the equivalent floating-point operation. This should be set to true
-  /// if the target has IEEE-754-compliant fabs/fneg operations for the input
-  /// type.
-  virtual bool hasBitPreservingFPLogic(EVT VT) const {
-    return false;
-  }
-
   /// Return true if it is cheaper to split the store of a merged int val
   /// from a pair of smaller values into multiple stores.
   virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const {
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -579,6 +579,9 @@
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                        unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+                                 const TargetLowering &TLI);
+
     SDValue CombineExtLoad(SDNode *N);
     SDValue CombineZExtLogicopShiftLoad(SDNode *N);
     SDValue combineRepeatedFPDivisors(SDNode *N);
@@ -14114,18 +14117,19 @@
   return DAG.getDataLayout().isBigEndian() ? 1 : 0;
 }
 
-static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
-                                    const TargetLowering &TLI) {
+SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+                                          const TargetLowering &TLI) {
   // If this is not a bitcast to an FP type or if the target doesn't have
   // IEEE754-compliant FP logic, we're done.
   EVT VT = N->getValueType(0);
-  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
+  SDValue N0 = N->getOperand(0);
+  EVT SourceVT = N0.getValueType();
+
+  if (!VT.isFloatingPoint())
     return SDValue();
 
   // TODO: Handle cases where the integer constant is a different scalar
   // bitwidth to the FP.
-  SDValue N0 = N->getOperand(0);
-  EVT SourceVT = N0.getValueType();
   if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
     return SDValue();
 
@@ -14148,6 +14152,9 @@
     return SDValue();
   }
 
+  if (LegalOperations && !TLI.isOperationLegal(FPOpcode, VT))
+    return SDValue();
+
   // This needs to be the inverse of logic in foldSignChangeInBitcast.
   // FIXME: I don't think looking for bitcast intrinsically makes sense, but
   // removing this would require more changes.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -830,11 +830,6 @@
       ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
       Value *Accumulator = nullptr) const override;
 
-  bool hasBitPreservingFPLogic(EVT VT) const override {
-    // FIXME: Is this always true? It should be true for vectors at least.
-    return VT == MVT::f32 || VT == MVT::f64;
-  }
-
   bool supportSplitCSR(MachineFunction *MF) const override {
     return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
            MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -396,7 +396,6 @@
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
 
-  bool hasBitPreservingFPLogic(EVT VT) const override;
   bool hasAtomicFaddRtnForTy(SDValue &Op) const;
   bool enableAggressiveFMAFusion(EVT VT) const override;
   bool enableAggressiveFMAFusion(LLT Ty) const override;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4465,10 +4465,6 @@
   }
 }
 
-bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
-  return isTypeLegal(VT.getScalarType());
-}
-
 bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
   switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
   case MVT::f32:
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1468,7 +1468,6 @@
     // tail call. This will cause the optimizers to attempt to move, or
     // duplicate return instructions to help enable tail call optimizations.
     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
-    bool hasBitPreservingFPLogic(EVT VT) const override;
     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
 
     /// getAddrModeForFlags - Based on the set of address flags, select the most
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -17638,15 +17638,6 @@
   return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
 }
 
-bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
-  if (!Subtarget.hasVSX())
-    return false;
-  if (Subtarget.hasP9Vector() && VT == MVT::f128)
-    return true;
-  return VT == MVT::f32 || VT == MVT::f64 ||
-         VT == MVT::v4f32 || VT == MVT::v2f64;
-}
-
 bool PPCTargetLowering::
 isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
   const Value *Mask = AndI.getOperand(1);
Index: llvm/lib/Target/RISCV/RISCVISelLowering.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -409,7 +409,6 @@
   /// should be stack expanded.
   bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
 
-  bool hasBitPreservingFPLogic(EVT VT) const override;
   bool shouldExpandBuildVectorWithShuffles(EVT VT,
                                            unsigned DefinedValues) const override;
 
Index: llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1492,12 +1492,6 @@
   return Index == 0 || Index == ResElts;
 }
 
-bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
-  return (VT == MVT::f16 && Subtarget.hasStdExtZfhOrZfhmin()) ||
-         (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
-         (VT == MVT::f64 && Subtarget.hasStdExtD());
-}
-
 MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                        CallingConv::ID CC,
                                                        EVT VT) const {
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -423,10 +423,6 @@
   }
   bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool preferZeroCompareBranch() const override { return true; }
-  bool hasBitPreservingFPLogic(EVT VT) const override {
-    EVT ScVT = VT.getScalarType();
-    return ScVT == MVT::f32 || ScVT == MVT::f64 || ScVT == MVT::f128;
-  }
   bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override {
     ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
     return Mask && Mask->getValue().isIntN(16);
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1082,8 +1082,6 @@
 
     bool isCtlzFast() const override;
 
-    bool hasBitPreservingFPLogic(EVT VT) const override;
-
     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
       // If the pair to store is a mixture of float and int values, we will
       // save two bitwise instructions and one float-to-int instruction and
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5961,10 +5961,6 @@
   return Subtarget.hasLZCNT();
 }
 
-bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
-  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
-}
-
 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
   // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
   // expensive than a straight movsd. On the other hand, it's important to
Index: llvm/test/CodeGen/AMDGPU/fneg.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fneg.ll
+++ llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -188,8 +188,8 @@
 }
 
 ; FUNC-LABEL: {{^}}s_fneg_i16_fp_use:
-; SI: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], -s{{[0-9]+}}
-; SI: v_add_f32_e32 [[ADD:v[0-9]+]], 2.0, [[CVT0]]
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], s{{[0-9]+}}
+; SI: v_sub_f32_e32 [[ADD:v[0-9]+]], 2.0, [[CVT0]]
 ; SI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[ADD]]
 
 ; VI: s_load_dword [[IN:s[0-9]+]]
@@ -204,8 +204,8 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_i16_fp_use:
 ; SI: s_waitcnt
-; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
 ; SI-NEXT: s_setpc_b64
 
 ; VI: s_waitcnt
@@ -257,8 +257,10 @@
 
 ; FUNC-LABEL: {{^}}s_fneg_v2i16_fp_use:
 ; SI: s_lshr_b32 s3, s2, 16
-; SI: v_cvt_f32_f16_e64 v0, -s3
-; SI: v_cvt_f32_f16_e64 v1, -s2
+; SI: v_cvt_f32_f16_e32 v0, s3
+; SI: v_cvt_f32_f16_e32 v1, s2
+; SI: v_sub_f32_e32 v0, 2.0, v0
+; SI: v_sub_f32_e32 v1, 2.0, v1
 
 ; VI: s_lshr_b32 s5, s4, 16
 ; VI: s_xor_b32 s5, s5, 0x8000
@@ -278,10 +280,10 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_v2i16_fp_use:
 ; SI: v_lshrrev_b32_e32 v1, 16, v0
-; SI: v_cvt_f32_f16_e64 v0, -v0
-; SI: v_cvt_f32_f16_e64 v1, -v1
-; SI: v_add_f32_e32 v0, 2.0, v0
-; SI: v_add_f32_e32 v1, 2.0, v1
+; SI: v_cvt_f32_f16_e32 v0, v0
+; SI: v_cvt_f32_f16_e32 v1, v1
+; SI: v_sub_f32_e32 v0, 2.0, v0
+; SI: v_sub_f32_e32 v1, 2.0, v1
 
 ; VI: s_waitcnt
 ; VI: v_mov_b32_e32 v1, 0x4000
Index: llvm/test/CodeGen/PowerPC/fabs.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/fabs.ll
+++ llvm/test/CodeGen/PowerPC/fabs.ll
@@ -13,12 +13,7 @@
 define float @bitcast_fabs(float %x) {
 ; CHECK-LABEL: bitcast_fabs:
 ; CHECK:       # %bb.0:
-; CHECK:         stfs f1, 8(r1)
-; CHECK:         lwz r3, 8(r1)
-; CHECK-NEXT:    clrlwi r3, r3, 1
-; CHECK-NEXT:    stw r3, 12(r1)
-; CHECK-NEXT:    lfs f1, 12(r1)
-; CHECK-NEXT:    addi r1, r1, 16
+; CHECK-NEXT:    fabs f1, f1
 ; CHECK-NEXT:    blr
 ;
   %bc1 = bitcast float %x to i32
Index: llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
+++ llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
@@ -211,8 +211,8 @@
 define half @bitcast_fabs(half %x) {
 ; CHECK-LABEL: bitcast_fabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast half %x to i16
   %and = and i16 %bc1, 32767
@@ -223,8 +223,8 @@
 define half @bitcast_fneg(half %x) {
 ; CHECK-LABEL: bitcast_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast half %x to i16
   %xor = xor i16 %bc1, 32768
@@ -285,8 +285,8 @@
 define half @nabs(half %a) {
 ; CHECK-LABEL: nabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast half %a to i16
   %and = or i16 %conv, -32768
Index: llvm/test/CodeGen/X86/fp128-i128.ll
===================================================================
--- llvm/test/CodeGen/X86/fp128-i128.ll
+++ llvm/test/CodeGen/X86/fp128-i128.ll
@@ -130,14 +130,8 @@
 define fp128 @TestI128_1(fp128 %x) #0 {
 ; SSE-LABEL: TestI128_1:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    subq $40, %rsp
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; SSE-NEXT:    andq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, (%rsp)
-; SSE-NEXT:    movaps (%rsp), %xmm0
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE-NEXT:    callq __lttf2@PLT
 ; SSE-NEXT:    xorl %ecx, %ecx
@@ -145,19 +139,13 @@
 ; SSE-NEXT:    sets %cl
 ; SSE-NEXT:    shlq $4, %rcx
 ; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0
-; SSE-NEXT:    addq $40, %rsp
+; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: TestI128_1:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    subq $40, %rsp
-; AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp)
-; AVX-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; AVX-NEXT:    andq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX-NEXT:    movq %rcx, (%rsp)
-; AVX-NEXT:    vmovaps (%rsp), %xmm0
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; AVX-NEXT:    callq __lttf2@PLT
 ; AVX-NEXT:    xorl %ecx, %ecx
@@ -165,7 +153,7 @@
 ; AVX-NEXT:    sets %cl
 ; AVX-NEXT:    shlq $4, %rcx
 ; AVX-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0
-; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128