This is an archive of the discontinued LLVM Phabricator instance.

Improved the interface of methods commuting operands, improved X86-FMA3 mem-folding&coalescing.
ClosedPublic

Authored by v_klochkov on Jul 20 2015, 2:21 PM.

Download Raw Diff

Details

Reviewers

qcolombet
• tstellarAMD
arsenm

Commits

rG16c4da03d5c8: Improved the interface of methods commuting operands, improved X86-FMA3 mem…
rL248735: Improved the interface of methods commuting operands, improved X86-FMA3 mem…

Summary

``The main goal of the change-set is to improve Memory-operand folding
and Coalescing optimizations performed for X86 FMA instructions
(Described in (1) below).

Unfortunately, that could not be done without interface changes
done in methods findCommutedOpIndices() and commuteInstruction().
(Described in (3) below).
The minor changes in non-X86 target sources (PowerPC, ARM and AMDGPU)
were required because the new commuteInstruction() method with 2 additional operands
is now called from llvm/lib/CodeGen/* classes common to all targets.

The size of the fix is pretty big because of having (1) and (3) in one
change-set. The alternative to this change-set could be splitting of
the change-set into 2 parts:

interface changes (described in (3) below)
improvement of X86 FMA form selection (described in (1) below).

(1) Implemented optimal form selection (213/312/231) for X86 FMA instructions

   to improve Memory-Folding/Ciscization and Coalescing optimizations performed
   for FMAs. The change-set allows commuting any of FMA operands: 1st and 2nd, 
   1st and 3rd, 2nd and 3rd. 
   Previously, only 1st and 2nd operands could be commuted.
   
   Better Memory-folding and Coalescing optimizations help to reduce
   register pressure. The improvement from the changes can be shown on
   the following example:

           for (int i = 0; i < N; i += 1) {
            val1 = _mm_and_pd(val1, val5);
            val2 = _mm_and_pd(val2, val6);
            val3 = _mm_and_pd(val3, val7);
            val4 = _mm_and_pd(val4, val8);
            val5 = _mm_xor_pd(val1, val5);
            val6 = _mm_xor_pd(val2, val6);
            val7 = _mm_xor_pd(val3, val7);
            val8 = _mm_xor_pd(val4, val8);

            v_accu1 = _mm_fmadd_pd(v_accu1, x1_arr[i], val1);
            v_accu2 = _mm_fmadd_pd(v_accu2, x2_arr[i], val2);
            v_accu3 = _mm_fmadd_pd(v_accu3, x3_arr[i], val3);
            v_accu4 = _mm_fmadd_pd(v_accu4, x4_arr[i], val4);
            v_accu5 = _mm_fmadd_pd(v_accu5, x5_arr[i], val5);
            v_accu6 = _mm_fmadd_pd(v_accu6, x6_arr[i], val6);
            v_accu7 = _mm_fmadd_pd(v_accu7, x7_arr[i], val7);
            v_accu8 = _mm_fmadd_pd(v_accu8, x8_arr[i], val8);
        }
		


    ASM code BEFORE the changes:
        .LBB1_2:                                # %for.body.6
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovapd %xmm0, -56(%rsp)        # 16-byte Spill
        vandpd  %xmm7, %xmm3, %xmm7
        vandpd  %xmm5, %xmm12, %xmm5
        vandpd  %xmm6, %xmm9, %xmm6
        vmovapd -40(%rsp), %xmm10       # 16-byte Reload
        vandpd  %xmm10, %xmm13, %xmm10
        vmovapd %xmm10, -40(%rsp)       # 16-byte Spill
        vxorpd  %xmm7, %xmm3, %xmm3
        vxorpd  %xmm5, %xmm12, %xmm12
        vxorpd  %xmm6, %xmm9, %xmm9
        vxorpd  %xmm10, %xmm13, %xmm13
        vmovapd %xmm8, %xmm0
        vmovapd x1_arr+8192(%rcx), %xmm8
        vmovapd -24(%rsp), %xmm1        # 16-byte Reload
        vfmadd213pd     %xmm7, %xmm8, %xmm1
        vmovapd %xmm1, -24(%rsp)        # 16-byte Spill
        vmovapd %xmm0, %xmm8
        vmovapd x2_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm5, %xmm1, %xmm4
        vmovapd x3_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm6, %xmm1, %xmm8
        vmovapd x4_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm10, %xmm1, %xmm11
        vmovapd -56(%rsp), %xmm0        # 16-byte Reload
        vmovapd x5_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm3, %xmm1, %xmm15
        vmovapd x6_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm12, %xmm1, %xmm0
        vmovapd x7_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm9, %xmm1, %xmm2
        vmovapd x8_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm13, %xmm1, %xmm14
        addq    $16, %rcx
        jne     .LBB1_2

        ASM code WITH the new changes (about 30% faster):
        .LBB1_2:                                # %for.body.6
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vandpd  %xmm7, %xmm3, %xmm7
        vandpd  %xmm5, %xmm2, %xmm5
        vandpd  %xmm6, %xmm0, %xmm6
        vandpd  %xmm1, %xmm4, %xmm1
        vxorpd  %xmm7, %xmm3, %xmm3
        vxorpd  %xmm5, %xmm2, %xmm2
        vxorpd  %xmm6, %xmm0, %xmm0
        vfmadd132pd     x1_arr+8192(%rcx), %xmm7, %xmm15
        vfmadd132pd     x2_arr+8192(%rcx), %xmm5, %xmm8
        vfmadd132pd     x3_arr+8192(%rcx), %xmm6, %xmm9
        vfmadd132pd     x4_arr+8192(%rcx), %xmm1, %xmm10
        vfmadd132pd     x5_arr+8192(%rcx), %xmm3, %xmm14
        vfmadd132pd     x6_arr+8192(%rcx), %xmm2, %xmm11
        vfmadd132pd     x7_arr+8192(%rcx), %xmm0, %xmm12
        vxorpd  %xmm1, %xmm4, %xmm4
        vfmadd132pd     x8_arr+8192(%rcx), %xmm4, %xmm13
        addq    $16, %rcx
        jne     .LBB1_2

(2) Fixed a correctness problem caused by commuting 1st and 2nd operands of

   scalar FMAs generated for intrinsics. The problem gets fixed
   AUTOMATICALLY/for-free by the proposed changes for (1).
   
   For FMA intrinsic call:

       __m128d foo(__m128d a, __m128d b, __m128d c) {
	     // must return XMM0={b[127:64], a[63:0]*b[63:0]+c[63:0]}
	     return _mm_fmadd_sd(b, a, c);
	   }

    The Coalescer/TwoAddressInstructionPass swapped the 1st and 2nd operands

of SCALAR FMA and invalidated the higher bits of the result returned
from foo().
The change-set fixes that and prohibits swapping 1st and 2nd operands
of scalar FMAs.

Swapping 1st and 2nd operands of scalar FMAs is possible and legal,
but only after special analysis of FMA users. Such optimization/analysis
can be implemented separately.

(3) The changes performed for (1) and (2) could not be implemented without

   interface changes in 2 methods of the TargetInstrInfo class and its child classes:
       
	   bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                                   unsigned &SrcOpIdx1,
                                                   unsigned &SrcOpIdx2) const;

       The operands SrcOpIdx1 and SrcOpIdx2 were used only for OUTPUT from
	   the method previously.
	   Now they are INPUT and OUTPUT arguments.
	   INPUT values specify the indices of operands that are wanted to be swapped.
	   The input value ~0U gives the findCommutedOpIndices freedom to pick 
	   any commutable operand (i.e. defines the _old_ behaviour of the method).
	   
	   MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
                                                         bool NewMI,
                                                         unsigned Idx1,
                                                         unsigned Idx2) const;

       Two arguments Idx1 and Idx2 were added to the method; they specify 
	   the operands to be swapped/commuted.
	    
    The old commuteInstruction() method did not let you ask to commute

1st and 3rd operands or 2nd and 3rd operands.

The changes in TwoAddressInstructionPass.cpp show how the updated methods
can be used to fix missing optimization opportunities that could happen

        previously.

Readability and risky assumptions.
Previously, something similar to this sequence was used in several places:

	    unsigned Idx1;
		unsigned Idx2;
	    if (findCommutedOpIndices(MI, Idx1, Idx2) &&
		    (Idx1 == 1 || Idx2 == 1)) {
	      commuteInstrction(MI, false); //!!! how can we know that Idx1
		                                //    and Idx2 are commuted here?
	      <do something with Idx1 and Idx2 operands here>
		}

The updated functions allow writing clearer, safer and more readable code:

        unsigned Idx1 = 1 /*want to commute 1st operand*/;
	    unsigned Idx2 = ~0U /*Don't care, choose any other operand*/;
	        if (findCommutedOpIndices(MI, Idx1, Idx2)) {
	          commuteInstrction(MI, false, Idx1, Idx2);
	          <do something with Idx1 and Idx2 operands here>
			}

The old method commuteInstruction() not specifying the commuted operands
was not removed as removing it would require restructuring/improvements
in some other places (similar to what was done in TwoAddressInstructionPass),
which cannot be done in this change-set as the current version of
the change-set is already too big.
``

Diff Detail

Event Timeline

v_klochkov retitled this revision from to Improved the interface of methods commuting operands, improved X86-FMA3 mem-folding&coalescing..Jul 20 2015, 2:21 PM

v_klochkov updated this object.

v_klochkov updated this revision to Diff 30193.Jul 20 2015, 2:21 PM

Herald added a reviewer: qcolombet. · View Herald TranscriptJul 20 2015, 2:21 PM

Herald added subscribers: MatzeB, aemerson. · View Herald Transcript

v_klochkov updated this object.Jul 20 2015, 2:53 PM

v_klochkov edited edge metadata.

v_klochkov added a subscriber: llvm-commits.

v_klochkov updated this object.Jul 20 2015, 2:56 PM

Hi Slava,

I added some comments, but I'm definitely not a reviewer. I suggest sending a mail to LLVM-dev saying that you want to improve the FMA code and asking who can review the patch.
Do you have "OK" for the up-streaming?

llvm/lib/Target/X86/X86InstrInfo.cpp
2931	I suggest to change from "Do not call" to "If you call"
2940	Please remove one empty line.
3169	Looks huge. I'm not sure, but maybe these enums are in ABC order and we can compare against the first and last? Or auto-generate something?
3378	This method may be static. Right?
3387	I suggest to separate scalar from vector. You handle them separately, right? I also think that you can put all FMA tables in a separate header file.
3546	I think this interface is inconvenient. I suggest to separate input and output. You can put ~0U as default value of input.

Some comments on the proposed interfaces:

llvm/include/llvm/Target/TargetInstrInfo.h
97–109	This is just a helper used to implement some of the other commuting methods. It should not be public and virtual, it doesn't even access any state and could be static or a helper function elsewhere.
274–277	I think just adding Idx1,Idx2 to the existing method should be enough. The two operand commutable case will also work with that and it is less confusing than having two overloaded variants around.
289–312	This interface looks complicated, two functions for querying commutability with different "query" styles. I think this should be simplified to a function that simply returns all operands that are commutable. I would imagine something like: bool findCommutedOpIndices(MachineInstr MI, SmallVectorImpl<unsigned> &CommutableOperandNums) const; or ArrayRef<unsigned> findCommutedOpIndices(MachineInstr MI) const;

Hi Vyacheslav,

I second Matthias' comments.

Just a few more things, see the inline comments.

Thanks,
-Quentin

llvm/include/llvm/Target/TargetInstrInfo.h
100	Please define a static constant for this magic value. I do not like magic values wandering around without context. I do not have a good naming right now though, maybe UndefinedIndex?
289–312	I think for the general case instead of having a list of unsigned, we would need a list of pair of unsigned.

`Elena, Matthias, Quentin,

Thank you for the code-review and comments.
The interface of the functions findCommutedOpIndices() and areOpsCommutable()
caused the biggest questions.

Combining Elena's idea with some of Quentin's comments regarding
the magic consts can give this first alternative:

const unsigned CommuteAnyOperandIndex = ~0U;
bool findCommutedOpIndices(MachineInstr *MI, 
                           unsigned &OpIdx1,
                           unsigned &OpIdx2,
                           unsigned MustCommuteIdx1 = CommuteAnyOperandIndex,
                           unsigned MustCommuteIdx2 = CommuteAnyOperandIndex);

The second alternative is to have a method returning all pairs
of commutable operands:

bool findCommutedOpIndices(MachineInstr *MI, 
                           SmallVectorImpl<_some_type_containing_a_pair_of_unsigned_indices_>);

(I am not sure if I should define some new class/struct or reuse some existing type for <pair of indices>,
but that's OK, let this question wait until we can decide which alternative seems better).

I really like Elena's idea; Please let me explain why.
The reason (a) below is the most important one.

a) In many cases we only need to know if some known operand is commutable with others.
   For example, RegisterCoalescer.cpp wants to know if 'UseOpIdx' can be swapped with something else;
   TwoAddressInstructionPass.cpp wants to know if 'BaseOpIdx' is commutable with something else;
   In some other cases we may even know the operands that need to be commuted and do not want
   to know about other operands commutativity.
   
   So, the first alternative provides a flexible instrument that helps to know just what we need to know,
   while the second alternative makes findCommutedOpIndices() collect information that often is
   not needed later.
   
   For example, I want to know if 2nd and 3rd operands of FMA are commutable.
   The 1st alternative just gives me the answer:
     if (findCommutedOpIndices(MI, Idx1, Idx2, 2, 3)) {}
   The 2nd alternative would probably return 3 pairs: <1,2>, <1,3>, <2,3>.
   I would not only need to find the desired <2,3> in the set of 3 pairs, but I would
   also ask findCommutedOpIndices() to do potentially expensive analysis regarding commutativity of
   the 1st operand (if FMA is scalar, then 1st operand is commutable only if users use only the lowest 
   element of XMM).
   
b) The 1st alternative helps to make the change-set a bit more compact than it is now,
   while the 2nd would require additional changes that would additionally complicate the places 
   where findCommutedOpIndices() is called now.

c) It would be very simple to remove the method areOpsCommutable() if it seems redundant.
   The calls of that method could be easily replaced with the calls of findCommutedOpIndices().
   For example:
     if (areOpsCommutable(MI, 1, 2)) {}
   ->
     if (findCommutedOpIndices(MI, Idx1, Idx2, 1, 2)) {}

Please let me know if you agree with the reasoning and if you are OK with Elena's idea.

Thanks,
Slava`

llvm/lib/Target/X86/X86InstrInfo.cpp
2931	This "Do not call" comment was moved to here from the old version of include/llvm/Target/TargetInstrInfo.h The TargetInstrInfo::commuteInstruction() has assert verifying that MI is commutable. After taking that assert into account this comment seems quite precise.
2940	Ok, removed it, the updated version of the change-set will have this fix.
3169	Unfortunately it is huge, I agree. Comparing against the first and last or having some assumptions about how and in which order the opcodes were defined seems a very risky approach causing unexpected effects/errors in future. I am pretty sure that we should not go this way. I considered the idea of having a special bit for FMAs (something similar to the fields defined in llvm/include/llvm/Target/Target.td: isReturn,isBitcast,etc). Adding isFMA3 to there would be inappropriate as FMA3 is meaningful only for X86, while all other 32 1-bit fields defined there are quite generic and usable for all targets. Also, adding even 1 bit to there will increase the size of IR. Unfortunately, I could not find anything similar but for X86 platform only.
3378	Yes, this method could be static and be similar to existing methods like "static bool isFrameLoadOpcode(int Opcode)", etc. The reason why I passed 'MachineInstruction' argument instead of 'Opcode' to this function and why this method is not static now, is that I wanted to reserve the opportunity to handle SCALAR FMAs and their 1st operand more optimistically later (when additional analysis of scalar FMA users would be implemented); please see the FIXME comment at the line 3487.
3387	There is one loop handling all vector and scalar FMAs below, I did not handle them separately. The 'IsScalar' field was needed only to handle the 1st operand with extra carefulness as commuting 1st operand of scalar FMA requires some additional analysis. Regarding the separating FMA tables into a separate header file... Separating it to a header file makes sense only when it would be used by something else, i.e. not only by one method. Otherwise, it is more convenient to have this array definition closer to the function/method using that table. Also, In my opinion the function local/static array OpcodeAlts is written using the same style that was used in several other places in this file (Please see the definition of MemoryFoldTable2Addr, MemoryFoldTable0, etc). Moving all similar static arrays of structures to a header file deserves a special/separate change-set.
3546	Special thank you for this comment! Separating INPUT and OUTPUT arguments seems very reasonable. I like this idea. In my opinion both approaches have right to live though. Before adding 2 additional arguments to findCommutedOpIndices() and fixing other places I would wait for more comments from reviewers.

So, the first alternative provides a flexible instrument that helps to know just what we need to know, while the second alternative makes findCommutedOpIndices() collect information that often is not needed later.

Right, but the first method does not tell the consumer that there are other alternatives for a given operand. The second method is much more general.

That being said, I believe you are right that most users of this method do not care about the alternatives, at least for now, so this is fine to have them iterate on the other indexes to check if there are alternatives. Exactly like you do in the two address pass.

Now, regarding the API, I would be in favor of something simpler, i.e.:

Kill areOpsCommutable.
Keep findCommutableOps with its current signature.

— Just make the two unsigned input/output parameters, like you did.
— Do not add two extra unsigned input parameters.

llvm/lib/CodeGen/TargetInstrInfo.cpp
208	Shouldn’t we just need two default arguments ~0U, instead of duplicating the prototype?
llvm/lib/Target/X86/X86InstrInfo.cpp
3546	I do not see why it is better to separate the input and output parameters here. As long as the parameter will have the value: CommuteAnyOperandIndex, we know how to make the distinction.

Thank you for the answer, Quentin.

I agree with your new comments and started preparing an updated change-set.
Regarding the default ~0U values for arguments of commuteInstruction() method, I still need your opinion - please see my answer to your question ('Inline Comment') for details.

Summarizing the planned additional changes:

remove areOpsCommutable()
fix ~0U magic const
change fixCommutedOpIndices() to a helper method.
? remove old commuteInstruction() method and duplicate code handling ~0U args 5 times..., but ONLY if you recommend doing that.

I would also add that if eventually we need a method that would return ALL pairs of commutable operands,
then it could be a NEW method that would have the word 'All' in its name:
... findAllCommutedOpIndices();
Such method (if needed) should be added in a separate change-set though.

Thank you,
Slava

llvm/include/llvm/Target/TargetInstrInfo.h
97–109	Yes, I agree, good catch. This will be fixed.
100	I agree, we need a const for this value. How about 'CommuteAnyOperandIndex' or 'AnyCommutableOperandIndex'?
274–277	Matthias, can you please explain your idea? In particular, what do you mean by saying '...should be enough. The two operand commutable case will also work with that'? I really want to just remove the old method that does not specify the commuted operands (i.e. to do exactly what you recommended here and to just add two operands to existing method). Unfortunately, I cannot do that without rewriting several places in LLVM. This change-set replaces some calls of commuteInstruction(MI) with calls of commuteInstruction(MI,false,Idx1,Idx2). That made the changes in those places more clear. There are though 7 or 8 places where the old style method is called and rewriting those places would make this change-set significantly bigger. For example, CodeGen/MachineCSE.cpp has 2 calls of old commuteInstruction() method. That place obviously has opportunities for improvement, but that should be done in a separate change-set as the current change-set is already very big. So, the only reason for having two variants of commuteInstruction() method is the need to limit the size of the change-set, to limit the efforts needed for code-review, and to eliminate the risks (correctness, etc.) that are brought by too huge change-sets.
llvm/lib/CodeGen/TargetInstrInfo.cpp
208	That could be done this way (i.e. use ~0U value as default values for the indices). This way is good as there will be only one commuteInstruction() method. The only big disadvantage of that way is that I'll need to duplicate the code of this method 5 times: in TargetInstrInfo.cpp (in commuteInstruction()), in the X86-specific implementation of the commuteInstruction() method, in the PowerPC-specific implementation, in the AMDGPU-specific implementation, and in the ARM-specific implementation. The duplicated code would be something like this: if (Idx1 == ~0U || Idx2 == ~0U) { if (!findCommutedOpIndices(MI, Idx1, Idx2)) { assert(...); return nullptr; } } After comparing the advantages and disadvantages... I would prefer to have this code in one place (i.e. as it is implemented now in the 1st change-set), but this question is not something very important for me and I will do as you recommend to do.

Quentin,

I updated the change-set according to the comments and recommendations from reviewers.

In this change-set I did not remove the old commuteInstruction() method as I am waiting for comments from you regarding my comment that says that this method can be removed only with the cost of duplication of the method code 5 times.

Thank you,
Slava

RKSimon added a subscriber: RKSimon.Aug 15 2015, 10:35 AM

Hi Slava,

In this change-set I did not remove the old commuteInstruction() method as I am waiting for comments from you regarding my comment that says that this method can be removed only with the cost of duplication of the method code 5 times.

I think it makes sense to only expose one API and have one overridable internal API. Then put the boring code shared in the public function.
E.g., something like:
class stuff {
protected:

virtual ty PublicAPIImpl(ty2);

public:

/* not virtual */ ty PublicAPI(ty2) {
  /* boring code */
 return PublicAPIImpl(ty2);
}

};

Thanks,
-Quentin

llvm/lib/Target/X86/X86InstrInfo.cpp
3494	My understanding is that you are addressing this point here: (2) Fixed a correctness problem caused by commuting 1st and 2nd operands of scalar FMAs generated for intrinsics. Most of the time I think we do not care about the high level bits of the value (which is what you are fixing here). Therefore, I wonder if we are not being pessimistic on the commutation opportunities. I agree we should seek correctness first, but I wonder how often that high level setting is actually expected… We had this bug forever and apparently nobody noticed it. Anyway, what is your plan to get us the performance back?

This change-set (3rd revision) is done according to Quentin's suggestion to have a 'protected virtual commuteInstructionImpl()' method
and to have the other method 'commuteInstruction()' non-virtual. The last one can accept CommuteAnyOperandIndex arguments.

This solution gives us only one public commuteInstruction() method instead of 2 methods with different interfaces/prototypes as it was in 2nd revision.

Herald added a subscriber: arsenm. · View Herald TranscriptAug 19 2015, 6:18 PM

Hi Quentin,

Thank you for the good idea (commuteInstructionImpl()). Hopefully, I understood it right.
I uploaded a new change-set. Would you please review the additional changes?

Also, I answered your question regarding the stability/correctness fix - please see my answer right after your inline comment/question. In my opinion, stability/correctness has priority over performance in this particular question.
There are at least 2 ways to fix the conservative code-gen.

Thank you,
Slava

llvm/lib/Target/X86/X86InstrInfo.cpp
3494	That correctness problem exists for FMAs and does not exist for ADD/MUL operations. Also, FMAs are relatively new instructions. For example, if you compile the test: #include <immintrin.h> double func(double y, double x) { return y + x; } __m128d funcx(__m128d y, __m128d x) { return _mm_add_sd(x, y); } then you'll see that only 1 instruction is generated for func() and 2 instructions for funcx(). func() just ignores the upper bits of returned XMM and funcx() correctly handles the upper bits of returned XMM value. The difference in IR is: ADDSDrr opcode is used in func(), ADDSDrr_Int opcode is used in funcx(). So, one of possible solutions could be to add _Int opcodes for FMA operations like it was done for ADD and MUL operations, and be more conservative for FMA_Int opcodes only. Another solution is mentioned in FIXME comment above, i.e. to implement functionality that can tell if only the lowest element of the result of scalar FMA is used. In my opinion, these 2 solutions do not exclude each other; they both should be implemented. Currently, we do not have FMA*_Int opcodes, that is why it would be better to be more conservative and correct. This patch might make the code a little bit worse/conservative on some corner cases, but it also improves code-gen for many other cases, for example, for those cases where the 1st or 2nd operand can be swapped with 3rd operand when it helps to do memory-op-folding optimization.

Hi Slava,

I think we are getting close to the final approval :).
Thanks for working on this.

I haven’t looked at the X86 specific part yet, as I would like the patch to be split in two:

One patch for the change in API and NFC for all the backends.
One patch to extend the commute code to handle the FMA cases.

Also, a few general comments:

Some of the formatting looks strange to me; please use clang-format on the patch.
Please do not insert blank comment lines around comments, i.e.,

<— Remove those
<Some comment>
// <— Remove those

Do not repeat the comment from the header in the cpp files.

Thanks,
-Quentin

llvm/include/llvm/Target/TargetInstrInfo.h
269	We can’t override this one anymore, please update the comment.
284–304	It would be useful to specify what is the behavior when both commute indices are CommuteAnyOperandIndex.
304	Same thing: what happens if both indices are set to CommuteAnyOperandIndex?
llvm/lib/CodeGen/TargetInstrInfo.cpp
141	We usually do not repeat the doxygen comment when they are set in the header. By doing so, we risk to have them being out-of-sync. Therefore, please keep only the one in the header. (Ditto for the other functions you’ve modified.)
llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
1167	Please document the meaning of the other arguments as well, like Dist, BaseOpKilled, etc.
1182	Invert the condition and use “continue”. (Per LLVM coding standard.)
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
179	It is not handled at all, right? Shouldn’t findCommutedOpIndices return only register operands?
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
888	I am guessing that you shouldn’t be the one doing this kind of change. Or at least, it should be a separate patch.

I removed the X86 FMA specific changes and left here only the interface changes
for commuteInstruction() and findCommutedOpIndices() methods according to Quentin's request.

Also, removed blank comment lines and duplicated methods' description/comments.

`Hi Quentin,

I appreciate the time you have spent on this code-review request.

In this change-set I removed X86 FMA specific changes - that will be a separate change-set
as you asked. This totally excluded the changes from the files:

llvm\test\CodeGen\X86\fma-commute-x86.ll
llvm\test\CodeGen\X86\fma_patterns.ll 
llvm\lib\Target\X86\X86InstrFMA.td

I removed the duplicated comments for methods (header vs cpp)
even though I personally like such duplication.
It is good to have function description in header file, but it is also
so convenient to have it in .cpp when you're looking at the method implementation;
you can see what arguments mean, etc. without need to take a look at *.h file.

AMDGPU changes:

I really did NOT want to do those non-obvious changes for AMDGPU.
Avoiding the changes would let me avoid questions and simplify the review/approval process.

Unfortunately, I had to do those changes. 
Also, those changes cannot be separated from the interface changes I did for
commuteInstruction() and findCommutedOpIndices().

The problem I met there can be explained this way:
1) There are some places like this:
     if (MI->IsCommutable() && TII->commuteInstruction(MI)) {}
    I.e. commuteInstruction() was called without preceding call of findCommutedOpIndices().
2) commuteInstruction() implementation for AMDGPU can commute Reg and Imm operands.

So, the solution is:
a) To allow AMDGPU implementation of findCommutedOpIndices() to return true
    when the second operand is Imm.
    Otherwise, all calls mentioned in problem (1) above would not commute instructions.
b) To fix SIFoldOperands.cpp/tryAddToFoldList() and add additional check there because
     Imm operand is not wanted there.
     I updated the FIXME comment there to make it more informative.

Thank you,
Slava
`

Hi Slava,

I still do not get the AMDGPU changes.

Unfortunately, I had to do those changes.
Also, those changes cannot be separated from the interface changes I did for
commuteInstruction() and findCommutedOpIndices().

We didn't change the basic semantic of those APIs, just making them more powerful, right?
So, why do we need to change the way we use them for this target.
Indeed, " if (MI->IsCommutable() && TII->commuteInstruction(MI)) {}" seems like a reasonable pattern to me.

What am I missing?

Thanks,
-Quentin

llvm/lib/CodeGen/TargetInstrInfo.cpp
226	The formatting still looks suspicious to me. I expect the "else" to be on the same line as the '}'. Have you run clang-format?
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
186	Since you said the target can commute imm and reg operand, why do we need this change?
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
797–802	I still don't get why we need to turn this if into an assert.

The code 'if (MI->IsCommutable() && TII->commuteInstruction(MI))' works differently before and after the changes in commuteInstruction() method .

BEFORE:

If MI has the attribute 'MCID::Commutable' set to true, then 
    1) go to AMDGPU specific implementation of commuteInstruction()
        and commute operands (even if those are Reg and Imm).

AFTER:

If MI has the attribute 'MCID::Commutable' set to true, then 
  1) go to TargetInstrInfo::commuteInstruction() and try to commute operands
      1a) call AMDGPU specific implementation of findCommutedOpIndices()
      1b) if could find commutable operands, then call AMDGPU specific implementation of commuteInstruction()

Step 1a) returned false and no operand commutation happened, which caused LIT test failures.
So, I just synchronized understanding of commutable operands in AMDGPU methods findCommutedOpIndices() and commuteInstruction().
The updated findCommutedOpIndices() returns true for Reg and Imm operands if such can be commuted by commuteInstruction().

The fix in SIFoldOperands.cpp was needed because the updated findCommutedOpIndices() may return true for commutable Reg and Imm operands.
In such cases the index of Imm operand is stored into 'FoldList' std::vector object.
Later (downstream of the optimization) the elements of returned FoldList are treated as if they are all REG operands, which causes assert violations for Imm operands stored to 'FoldList'.

I'll fix the mentioned formatting issues, and will try to run the clang-format tools.

Thanks,
Slava

llvm/include/llvm/Target/TargetInstrInfo.h
266–270	Fixed.
284–305	Fixed.
llvm/lib/CodeGen/TargetInstrInfo.cpp
141	Ok, removed the duplicated comments/descriptions.
llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
1167	Fixed.
1182	Fixed.
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
179	findCommutedOpIndices() returns commutable operands. They can be Imm operands for AMDGPU target. I also updated the FIXME comment to make it more informative.

Fixed coding style/standard violations such as too long lines, indentations, etc.
using the recommendations from 'clang-format-diff.py' tool.

Please see my answer regarding the new assert in AMDGPU version of commuteInstructionImpl() method.
Thanks,
Slava

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
797–802	(Src1Idx == -1) is impossible here as this place is reachable only after findCommutedOpIndices() call which filters out such situations. This assert for Src1Idx is semantically and stylistically equivalent to the assert at the line 788 (assert for Src0Idx).

Hi Slava,

LGTM.

The code 'if (MI->IsCommutable() && TII->commuteInstruction(MI))' works differently before and after the changes in the commuteInstruction() method.

Thanks for the explanation, now I see the difference in semantics. You’ll want to send an email to give a heads-up to out-of-tree targets about this change.

Also, before landing the patch, ping Tom (thomas.stellard@amd.com, code owner of the AMDGPU), to see if the changes in AMDGPU also look good to him.

Cheers,
-Quentin

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
178–184	The comment is just fine without the FIXME in front of it. Just remove it, i.e., I do not think there is anything to fix here in the end.

This revision is now accepted and ready to land.Sep 4 2015, 4:21 PM

v_klochkov added a reviewer: • tstellarAMD.Sep 17 2015, 11:22 AM

Mr. Stellard,
Please review and approve the changes in 3 files owned by AMDGPU target:

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

All the changes (14 files, including AMDGPU) have been reviewed by Quentin Colombet.

Also, below I have attached the e-mails I sent to your amd.com address.
Please see them for more details.

Thank you,
Vyacheslav Klochkov

From: Klochkov, Vyacheslav N  
Sent: Monday, September 14, 2015 4:46 PM
To: '<EDITED>@amd.com'
Cc: Klochkov, Vyacheslav N
Subject: RE: LLVM code-review: http://reviews.llvm.org/D11370

Mr. Stellard,

In this e-mail I am asking for your approval of the AMDGPU-specific changes
that are a small part of the changes improving the methods that commute operands of Machine Instructions.

Please see the details below.
Also, I will be more than happy to answer your questions if you have any.

Thank you,
Vyacheslav Klochkov
------------------------------------------------------------------------------------
From: Klochkov, Vyacheslav N 
Sent: Tuesday, September 8, 2015 4:05 PM
To: <EDITED>@amd.com
Cc: Klochkov, Vyacheslav N
Subject: LLVM code-review: http://reviews.llvm.org/D11370

Dear Mr. Stellard,

Would you please approve the AMDGPU specific changes in this code-review tracker:
http://reviews.llvm.org/D11370

This change-set was reviewed and approved by several people:
-        Quentin Colombet (official code-reviewer).
-        David Kreitzer (code-review before submitting changes to Open Source community);
-        Michael Kuperstein (code-review before submitting changes to community);
-        Matthias Braun and Elena Demikhovsky (not official reviewers, sent some comments).
Quentin recommended to ask for your approval for AMDGPU changes.

This change-set includes changes in 14 files, 3 of 14 are AMDGPU specific:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

The change-set improves the interface of the findCommutedOpIndices() and commuteInstruction() methods.
It did not change the code-generation for AMDGPU, all LIT tests passed.

The old commuteInstruction() method did not specify the indices of the operands to be commuted.
The main idea of the change-set is that commuteInstruction() must be able to have the commuted operands be specified explicitly.
That is needed when a caller of commuteInstruction() method knows what operands must be commuted.

For example, if the 1st and 2nd operands are commutable and the 1st and 3rd operands are commutable too,
the old commuteInstruction() method did not allow performing the second commute transformation.
The new method fixes that problem.

It is still possible not to specify the operands to be commuted, but in such cases the operands to be commuted must be found by the method findCommutedOpIndices()
(Please see the commuteInstruction() method in llvm/lib/CodeGen/TargetInstrInfo.cpp).

This interface change in llvm/lib/CodeGen/TargetInstrInfo.cpp required minor changes in the target-specific implementations of
findCommutedOpIndices() and commuteInstruction().

The changes in ARM, X86 and PowerPC went very smoothly. AMDGPU required somewhat larger changes.

For AMDGPU I tuned findCommutedOpIndices() in such a way that it now returns TRUE for commutable Reg and Imm operands,
as Reg and Imm operands can be commuted in the AMDGPU-specific implementation of the commuteInstruction() method.
So, findCommutedOpIndices() and commuteInstruction() are in sync now, i.e. the 1st returns true when the 2nd can do the commute.

The change in findCommutedOpIndices() allowed me to add an assert on one of operands in commuteInstructionImpl()  AMDGPU method.
Also, I needed to add a simple check to SIFoldOperands.cpp to avoid Imm operands in the transformation where only Reg operands are expected.

Thank you,
Vyacheslav Klochkov

arsenm added inline comments.Sep 17 2015, 11:44 AM

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
875–876	Typo: immeditate
886–888	It's not entirely accurate to use isVOP2 / isVOP3 for checking the number of operands. VOP* instructions are always available in a VOP3 encoding, but will still have < 3 operands. Checking if AMDGPU::OpName::src2 is a valid operand is a more reliable check. SALU instructions with an immediate can also be commuted, although there is less reason to do so other than canonicalization. Although it looks like isVOP2/isVOP3 is what commuteInstruction already checks so I guess this is OK for now.

arsenm added inline comments.Sep 17 2015, 11:59 AM

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
799–800	I would prefer to deMorgan's law this and distribute the !

Thank you for the comments.
I fixed the "immeditate" misprint,
replaced the if statement: if (!(A && B) && !(C && D)) --> if ((!A || !B) && (!C || !D))
and removed the 'FIXME' word in SIFoldOperands.cpp (according to the recommendation from Quentin Colombet).

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
178–184	Fixed. I removed the "FIXME" word.
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
799–800	Ok, Fixed.
875–876	Fixed: immeditate -> immediate.
886–888	Thank you for the explanations. I am quite happy that you are OK with the current version of the changes, as fixing such subtle things should be done by AMDGPU experts. In this change-set I just synchronized the checks in findCommutedOpIndices() and commuteInstructionImpl() (i.e. re-used the checks from commuteInstructionImpl() in findCommutedOpIndices()).

LGTM

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
780–781	I just removed this check a few days ago, so you probably will have a conflict when you apply this to trunk
801–802	These asserts can be removed. I decided that there's no point to checking if src0/src1 are valid operands

Closed by commit rL248735: Improved the interface of methods commuting operands, improved X86-FMA3 mem… (authored by akaylor). · Explain WhySep 28 2015, 1:35 PM

This revision was automatically updated to reflect the committed changes.

v_klochkov mentioned this in D13269: Improved X86-FMA3 mem-folding & coalescing.Sep 29 2015, 1:50 PM

v_klochkov mentioned this in D13710: New X86 FMA3*_Int opcodes for scalar FMA intrinsics..Oct 13 2015, 4:00 PM

Revision Contents

Path

Size

llvm/

include/

llvm/

Target/

TargetInstrInfo.h

58 lines

lib/

CodeGen/

RegisterCoalescer.cpp

24 lines

TargetInstrInfo.cpp

131 lines

TwoAddressInstructionPass.cpp

101 lines

Target/

AMDGPU/

SIFoldOperands.cpp

12 lines

SIInstrInfo.h

5 lines

SIInstrInfo.cpp

56 lines

SIShrinkInstructions.cpp

6 lines

ARM/

ARMBaseInstrInfo.h

6 lines

ARMBaseInstrInfo.cpp

16 lines

Thumb2SizeReduction.cpp

9 lines

PowerPC/

PPCInstrInfo.h

18 lines

PPCInstrInfo.cpp

26 lines

X86/

X86InstrFMA.td

91 lines

X86InstrInfo.h

79 lines

X86InstrInfo.cpp

545 lines

test/

CodeGen/

X86/

fma-commute-x86.ll

312 lines

fma_patterns.ll

14 lines

Diff 30193

llvm/include/llvm/Target/TargetInstrInfo.h

Show First 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	protected:
/// than producing a value, or if it requres any address registers that are		/// than producing a value, or if it requres any address registers that are
/// not always available.		/// not always available.
/// Requirements must be check as stated in isTriviallyReMaterializable() .		/// Requirements must be check as stated in isTriviallyReMaterializable() .
virtual bool isReallyTriviallyReMaterializable(const MachineInstr *MI,		virtual bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA) const {		AliasAnalysis *AA) const {
return false;		return false;
}		}

		/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
		/// operand indices to (ResultIdx1, ResultIdx2).
		/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
		/// predefined to some indices or be undefined (designated by ~0U value).
		qcolombetUnsubmitted Not Done Reply Inline Actions Please define a static constant for this magic value. I do not like magic values wandering around without context. I do not have a good naming right now though, maybe UndefinedIndex? qcolombet: Please define a static constant for this magic value. I do not like magic values wandering…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions I agree, we need a const for this value. How about 'CommuteAnyOperandIndex' or 'AnyCommutableOperandIndex'? v_klochkov: I agree, we need a const for this value. How about 'CommuteAnyOperandIndex' or…
		/// The predefined result indices cannot be re-defined.
		/// The function returns true iff after the result pair redefinition
		/// the fixed result pair is equal to or equivalent to the source pair of
		/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
		/// the pairs (x,y) and (y,x) are equivalent.
		virtual bool fixCommutedOpIndices(unsigned &ResultIdx1,
		unsigned &ResultIdx2,
		unsigned CommutableOpIdx1,
		unsigned CommutableOpIdx2) const;
		MatzeBUnsubmitted Not Done Reply Inline Actions This is just a helper used to implement some of the other commuting methods. It should not be public and virtual, it doesn't even access any state and could be static or a helper function elsewhere. MatzeB: This is just a helper used to implement some of the other commuting methods. It should not be…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Yes, I agree, good catch. This will be fixed. v_klochkov: Yes, I agree, good catch. This will be fixed.

private:		private:
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is		/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
/// set and the target hook isReallyTriviallyReMaterializable returns false,		/// set and the target hook isReallyTriviallyReMaterializable returns false,
/// this function does target-independent tests to determine if the		/// this function does target-independent tests to determine if the
/// instruction is really trivially rematerializable.		/// instruction is really trivially rematerializable.
bool isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,		bool isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
AliasAnalysis *AA) const;		AliasAnalysis *AA) const;

▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines	public:
/// This method returns a null pointer if the transformation cannot be		/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the last new instruction.		/// performed, otherwise it returns the last new instruction.
///		///
virtual MachineInstr *		virtual MachineInstr *
convertToThreeAddress(MachineFunction::iterator &MFI,		convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const {		MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const {
return nullptr;		return nullptr;
}		}

/// If a target has any instructions that are commutable but require		/// If a target has any instructions that are commutable but require
/// converting to different instructions or making non-trivial changes to		/// converting to different instructions or making non-trivial changes to
/// commute them, this method can overloaded to do that.		/// commute them, this method can overloaded to do that.
		qcolombetUnsubmitted Not Done Reply Inline Actions We can’t override this one anymore, please update the comment. qcolombet: We can’t override this one anymore, please update the comment.
/// The default implementation simply swaps the commutable operands.		/// The default implementation simply swaps the commutable operands.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
/// If NewMI is false, MI is modified in place and returned; otherwise, a		/// If NewMI is false, MI is modified in place and returned; otherwise, a
/// new machine instruction is created and returned. Do not call this		/// new machine instruction is created and returned.
/// method for a non-commutable instruction, but there may be some cases		///
/// where this method fails and returns null.		/// The overloaded version of the method with the indices of the
		/// commuted operands may be used when the commuted instruction has
		/// more than two operands and thus, there may be preferences in what
		/// operand must be commuted.
		MatzeBUnsubmitted Not Done Reply Inline Actions I think just adding Idx1,Idx2 to the existing method should be enough. The two operand commutable case will also work with that and it less confusing than having two overloaded variants around. MatzeB: I think just adding Idx1,Idx2 to the existing method should be enough. The two operand…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Matthias, can you please explain your idea? In particular, what do you mean by saying '...should be enough. The two operand commutable case will also work with that'? I really want to just remove the old method that does not specify the commuted operands (i.e. to do exactly what you recommended here and to just add two operands to existing method). Unfortunately, I cannot do that without rewriting several places in LLVM. This change-set replaces some calls of commuteInstruction(MI) calls with calls of commuteInstruction(MI,false,Idx1,Idx2). That made the changes in those places more clear. There are though 7 or 8 places where the old style method is called and rewriting those places would make this change-set significantly bigger. For example, CodeGen/MachineCSE.cpp has 2 calls of old commuteInstruction() method. That place is obviously has opportunities for improvement, but that should be done in a separate change-set as the current change-set is already very big. So, the only reason for having two variants of commuteInstruction() method is the need to limit the size of the change-set, to limit the efforts needed for code-review, and to eliminate the risks (correctness, etc.) that are brought by too huge change-sets. v_klochkov: Matthias, can you please explain your idea? In particular, what do you mean by saying '...
		///
		/// Do not call these methods for a non-commutable instruction.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
virtual MachineInstr commuteInstruction(MachineInstr MI,		virtual MachineInstr commuteInstruction(MachineInstr MI,
bool NewMI = false) const;		bool NewMI = false) const;
		virtual MachineInstr commuteInstruction(MachineInstr MI,
/// If specified MI is commutable, return the two operand indices that would		bool NewMI,
/// swap value. Return false if the instruction		unsigned Idx1,
/// is not in a form which this routine understands.		unsigned Idx2) const;

		/// Returns true iff the routine could find two commutable operands in the
		/// given machine instruction.
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
		/// input values can be re-defined in this method only if the input values
		/// are not pre-defined, which is designated by the special value ~0U
		/// assigned to it.
		/// If both of indices are pre-defined and refer to some operands, then the
		/// method simply returns true if the corresponding operands are commutable
		/// and returns false otherwise.
		///
		/// For example, calling this method this way:
		/// unsigned Op1 = 1, Op2 = ~0U;
		/// findCommutedOpIndices(MI, Op1, Op2);
		/// can be interpreted as a query asking to find an operand that would be
		/// commutable with the operand#1.
		///
		qcolombetUnsubmitted Not Done Reply Inline Actions It would be useful to specify what is the behavior when both commute indices are CommuteAnyOperandIndex. qcolombet: It would be useful to specify what is the behavior when both commute indices are…
		qcolombetUnsubmitted Not Done Reply Inline Actions Same thing: what happens if both indices are set to CommuteAnyOperandIndex? qcolombet: Same thing: what happens if both indices are set to CommuteAnyOperandIndex?
virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,		virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
unsigned &SrcOpIdx2) const;		unsigned &SrcOpIdx2) const;

		/// Returns true if the specified MI is commutable and the operands with
		/// indices SrcOpIdx1 and SrcOpIdx2 can swap their values.
		/// Otherwise, returns false.
		virtual bool areOpsCommutable(MachineInstr *MI, unsigned SrcOpIdx1,
		unsigned SrcOpIdx2) const;
		MatzeBUnsubmitted Not Done Reply Inline Actions This interface looks complicated, two functions for querying commutability with different "query" styles. I think this should be simplified to a function that simply returns all operands that are commutable. I would imagine something like: bool findCommutedOpIndicess(MachineInstr MI, SmallVectorImpl<unsigned> &CommutableOperandNums) const; or ArrayRef<unsigned> findCommutedOpIndices(MachineInstr MI) const; MatzeB: This interface looks complicated, two functions for querying commutability with different…
		qcolombetUnsubmitted Not Done Reply Inline Actions I think for the general case instead of having a list of unsigned, we would need a list of pair of unsigned. qcolombet: I think for the general case instead of having a list of unsigned, we would need a list of pair…

/// A pair composed of a register and a sub-register index.		/// A pair composed of a register and a sub-register index.
/// Used to give some type checking when modeling Reg:SubReg.		/// Used to give some type checking when modeling Reg:SubReg.
struct RegSubRegPair {		struct RegSubRegPair {
unsigned Reg;		unsigned Reg;
unsigned SubReg;		unsigned SubReg;
RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)		RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)
: Reg(Reg), SubReg(SubReg) {}		: Reg(Reg), SubReg(SubReg) {}
};		};
▲ Show 20 Lines • Show All 994 Lines • Show Last 20 Lines

llvm/lib/CodeGen/RegisterCoalescer.cpp

Show First 20 Lines • Show All 673 Lines • ▼ Show 20 Lines	if (!DefMI->isCommutable())
return false;		return false;
// If DefMI is a two-address instruction then commuting it will change the		// If DefMI is a two-address instruction then commuting it will change the
// destination register.		// destination register.
int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);		int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
assert(DefIdx != -1);		assert(DefIdx != -1);
unsigned UseOpIdx;		unsigned UseOpIdx;
if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))		if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
return false;		return false;
unsigned Op1, Op2, NewDstIdx;
if (!TII->findCommutedOpIndices(DefMI, Op1, Op2))		//
return false;		// FIXME: The code below tries to commute 'UseOpIdx' operand with some other
if (Op1 == UseOpIdx)		// commutable operand which is expressed by ~0U value passed to the method.
NewDstIdx = Op2;		// That _other_ operand is chosen by the findCommutedOpIndices() method.
else if (Op2 == UseOpIdx)		//
NewDstIdx = Op1;		// That is obviously an area for improvement in case of instructions having
else		// more than 2 operands. For example, if some instruction has 3 commutable
		// operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
		// op#2<->op#3) of commute transformation should be considered/tried here.
		//
		unsigned NewDstIdx = ~0U;
		if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx))
return false;		return false;

MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);		MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
unsigned NewReg = NewDstMO.getReg();		unsigned NewReg = NewDstMO.getReg();
if (NewReg != IntB.reg \|\| !IntB.Query(AValNo->def).isKill())		if (NewReg != IntB.reg \|\| !IntB.Query(AValNo->def).isKill())
return false;		return false;

// Make sure there are no other definitions of IntB that would reach the		// Make sure there are no other definitions of IntB that would reach the
Show All 16 Lines	bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
}		}

DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'		DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
<< *DefMI);		<< *DefMI);

// At this point we have decided that it is legal to do this		// At this point we have decided that it is legal to do this
// transformation. Start by commuting the instruction.		// transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent();		MachineBasicBlock *MBB = DefMI->getParent();
MachineInstr *NewMI = TII->commuteInstruction(DefMI);		MachineInstr *NewMI = TII->commuteInstruction(DefMI, false,
		UseOpIdx, NewDstIdx);
if (!NewMI)		if (!NewMI)
return false;		return false;
if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&		if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
TargetRegisterInfo::isVirtualRegister(IntB.reg) &&		TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))		!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
return false;		return false;
if (NewMI != DefMI) {		if (NewMI != DefMI) {
LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);		LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);
▲ Show 20 Lines • Show All 2,273 Lines • Show Last 20 Lines

llvm/lib/CodeGen/TargetInstrInfo.cpp

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,

// If MBB isn't immediately before MBB, insert a branch to it.		// If MBB isn't immediately before MBB, insert a branch to it.
if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))		if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(),		InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(),
Tail->getDebugLoc());		Tail->getDebugLoc());
MBB->addSuccessor(NewDest);		MBB->addSuccessor(NewDest);
}		}

// commuteInstruction - The default implementation of this method just exchanges		/// commuteInstruction - If a target has any instructions that are
// the two operands returned by findCommutedOpIndices.		/// commutable but require converting to different instructions or making
		/// non-trivial changes to commute them, these methods can be overloaded to
		/// do that. The default implementations simply swap the commutable
		/// operands.
		///
		/// If NewMI is false, MI is modified in place and returned; otherwise, a
		/// new machine instruction is created and returned.
		///
		/// The passed operand indices are used to tell what operands must
		/// be commuted.
		///
		/// Do not call this method for a non-commutable instruction.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
MachineInstr TargetInstrInfo::commuteInstruction(MachineInstr MI,		MachineInstr TargetInstrInfo::commuteInstruction(MachineInstr MI,
bool NewMI) const {		bool NewMI,
		unsigned Idx1,
		unsigned Idx2) const {
const MCInstrDesc &MCID = MI->getDesc();		const MCInstrDesc &MCID = MI->getDesc();
		qcolombetUnsubmitted Not Done Reply Inline Actions We usually do not repeat the doxygen comment when they are set in the header. By doing so, we risk to have them being out-of-sync. Therefore, please keep only the one in the header. (Ditto for the other functions you’ve modified.) qcolombet: We usually do not repeat the doxygen comment when they are set in the header. By doing so, we…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, removed the duplicated comments/descriptions. v_klochkov: Ok, removed the duplicated comments/descriptions.
bool HasDef = MCID.getNumDefs();		bool HasDef = MCID.getNumDefs();
if (HasDef && !MI->getOperand(0).isReg())		if (HasDef && !MI->getOperand(0).isReg())
// No idea how to commute this instruction. Target should implement its own.		// No idea how to commute this instruction. Target should implement its own.
return nullptr;		return nullptr;
unsigned Idx1, Idx2;
if (!findCommutedOpIndices(MI, Idx1, Idx2)) {
assert(MI->isCommutable() && "Precondition violation: MI must be commutable.");
return nullptr;
}

		assert(areOpsCommutable(MI, Idx1, Idx2) &&
		"TargetInstrInfo::CommuteInstruction(): not commutable operands.");
assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&		assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
"This only knows how to commute register operands so far");		"This only knows how to commute register operands so far");

unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;		unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(Idx1).getReg();		unsigned Reg1 = MI->getOperand(Idx1).getReg();
unsigned Reg2 = MI->getOperand(Idx2).getReg();		unsigned Reg2 = MI->getOperand(Idx2).getReg();
unsigned SubReg0 = HasDef ? MI->getOperand(0).getSubReg() : 0;		unsigned SubReg0 = HasDef ? MI->getOperand(0).getSubReg() : 0;
unsigned SubReg1 = MI->getOperand(Idx1).getSubReg();		unsigned SubReg1 = MI->getOperand(Idx1).getSubReg();
unsigned SubReg2 = MI->getOperand(Idx2).getSubReg();		unsigned SubReg2 = MI->getOperand(Idx2).getSubReg();
bool Reg1IsKill = MI->getOperand(Idx1).isKill();		bool Reg1IsKill = MI->getOperand(Idx1).isKill();
bool Reg2IsKill = MI->getOperand(Idx2).isKill();		bool Reg2IsKill = MI->getOperand(Idx2).isKill();
Show All 33 Lines	MachineInstr TargetInstrInfo::commuteInstruction(MachineInstr MI,
MI->getOperand(Idx1).setIsKill(Reg2IsKill);		MI->getOperand(Idx1).setIsKill(Reg2IsKill);
MI->getOperand(Idx2).setIsUndef(Reg1IsUndef);		MI->getOperand(Idx2).setIsUndef(Reg1IsUndef);
MI->getOperand(Idx1).setIsUndef(Reg2IsUndef);		MI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
MI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);		MI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
MI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);		MI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
return MI;		return MI;
}		}

/// findCommutedOpIndices - If specified MI is commutable, return the two		/// The default implementation of this method just exchanges the two operands.
/// operand indices that would swap value. Return true if the instruction		/// This method is used when the caller does not care about what operands
/// is not in a form which this routine understands.		/// should be commuted or when there is only one way of doing operands commute
		/// transformation, for example, when the commuted instruction has only
		/// 2 operands.
		///
		MachineInstr TargetInstrInfo::commuteInstruction(MachineInstr MI,
		bool NewMI) const {
		qcolombetUnsubmitted Not Done Reply Inline Actions Shouldn’t we just need two default arguments ~0U, instead of duplicating the prototype? qcolombet: Shouldn’t we just need two default arguments ~0U, instead of duplicating the prototype?
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions That could be done this way (i.e. use ~0U value as default values for the indices). This way is good as there will be only one commuteInstruction() method. The only big disadvantage on that way is that I'll need to duplicate the code of this method 5 times: in TargetInstrInfo.cpp (in commuteInstruction() in X86 specific implementation of commuteInstruction() method in PowerPC specific implementation in AMDGPU specific implementation in ARM specific implementation. The duplicated code would be something like this: if (Idx1 == ~0U \|\| Idx2 == ~0U) { if (!findCommutedOpIndices(MI, Idx1, Idx2)) { asssert(...); return nullptr; } } After comparing the advantages and disadvantages... I would prefer to have this code in one place (i.e. as it is implemented now in the 1st change-set), but this question is not something very important for me and I will do as you recommend to do. v_klochkov: That could be done this way (i.e. use ~0U value as default values for the indices). This way…
		unsigned OpIdx1 = ~0U, OpIdx2 = ~0U;

		if (!findCommutedOpIndices(MI, OpIdx1, OpIdx2)) {
		assert(MI->isCommutable() &&
		"Precondition violation: MI must be commutable.");
		return nullptr;
		}
		return commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
		}

		/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
		/// operand indices to (ResultIdx1, ResultIdx2).
		/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
		/// predefined to some indices or be undefined (designated by ~0U value).
		/// The predefined result indices cannot be re-defined.
		/// The function returns true iff after the result pair redefinition
		/// the fixed result pair is equal to or equivalent to the source pair of
		/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
		qcolombetUnsubmitted Not Done Reply Inline Actions The formatting still looks suspicious to me. I expect the "else" to be on the same line as the '}'. Have you run clang-format? qcolombet: The formatting still looks suspicious to me. I expect the "else" to be on the same line as the…
		/// the pairs (x,y) and (y,x) are equivalent.
		///
		bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
		unsigned &ResultIdx2,
		unsigned CommutableOpIdx1,
		unsigned CommutableOpIdx2) const {
		if (ResultIdx1 == ~0U && ResultIdx2 == ~0U) {
		ResultIdx1 = CommutableOpIdx1;
		ResultIdx2 = CommutableOpIdx2;
		}
		else if (ResultIdx1 == ~0U) {
		if (ResultIdx2 == CommutableOpIdx1)
		ResultIdx1 = CommutableOpIdx2;
		else if (ResultIdx2 == CommutableOpIdx2)
		ResultIdx1 = CommutableOpIdx1;
		else
		return false;
		}
		else if (ResultIdx2 == ~0U) {
		if (ResultIdx1 == CommutableOpIdx1)
		ResultIdx2 = CommutableOpIdx2;
		else if (ResultIdx1 == CommutableOpIdx2)
		ResultIdx2 = CommutableOpIdx1;
		else
		return false;
		}
		else
		// Check that the result operand indices match the given commutable
		// operand indices.
		return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) \|\|
		(ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1);

		return true;
		}

		/// Returns true iff the routine could find two commutable operands in the
		/// given machine instruction.
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
		/// input values can be re-defined in this method only if the input values
		/// are not pre-defined, which is designated by the special value ~0U
		/// assigned to it.
		/// If both of indices are pre-defined and refer to some operands, then the
		/// method simply returns true if the corresponding operands are commutable
		/// and returns false otherwise.
		///
		/// For example, calling this method this way:
		/// unsigned Op1 = 1, Op2 = ~0U;
		/// findCommutedOpIndices(MI, Op1, Op2);
		/// can be interpreted as a query asking to find an operand that would be
		/// commutable with the operand#1.
		///
bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,		bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,		unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {		unsigned &SrcOpIdx2) const {
assert(!MI->isBundle() &&		assert(!MI->isBundle() &&
"TargetInstrInfo::findCommutedOpIndices() can't handle bundles");		"TargetInstrInfo::findCommutedOpIndices() can't handle bundles");

const MCInstrDesc &MCID = MI->getDesc();		const MCInstrDesc &MCID = MI->getDesc();
if (!MCID.isCommutable())		if (!MCID.isCommutable())
return false;		return false;

// This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this		// This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
// is not true, then the target must implement this.		// is not true, then the target must implement this.
SrcOpIdx1 = MCID.getNumDefs();		unsigned CommutableOpIdx1 = MCID.getNumDefs();
SrcOpIdx2 = SrcOpIdx1 + 1;		unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
		if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
		CommutableOpIdx1, CommutableOpIdx2))
		return false;

if (!MI->getOperand(SrcOpIdx1).isReg() \|\|		if (!MI->getOperand(SrcOpIdx1).isReg() \|\|
!MI->getOperand(SrcOpIdx2).isReg())		!MI->getOperand(SrcOpIdx2).isReg())
// No idea.		// No idea.
return false;		return false;
return true;		return true;
}		}

		/// Tells whether two specific operands of MI may be swapped.
		/// The result is true when MI is commutable and the operands at indices
		/// SrcOpIdx1 and SrcOpIdx2 can exchange their values; it is false
		/// otherwise. Both indices must refer to existing operands of MI.
		bool TargetInstrInfo::areOpsCommutable(MachineInstr *MI,
		                                       unsigned SrcOpIdx1,
		                                       unsigned SrcOpIdx2) const {
		  const unsigned NumOps = MI->getNumOperands();
		  assert(SrcOpIdx1 < NumOps && SrcOpIdx2 < NumOps &&
		         "TargetInstrInfo::areOpsCommutable() illegal operand index.");

		  // The indices are passed by value here, so anything that
		  // findCommutedOpIndices() writes through its reference parameters
		  // lands in these local copies and is not visible to the caller.
		  const bool Commutable = findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
		  return Commutable;
		}

bool		bool
TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {		TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
if (!MI->isTerminator()) return false;		if (!MI->isTerminator()) return false;

// Conditional branch is a special case.		// Conditional branch is a special case.
if (MI->isBranch() && !MI->isBarrier())		if (MI->isBranch() && !MI->isBarrier())
return true;		return true;
▲ Show 20 Lines • Show All 736 Lines • Show Last 20 Lines

llvm/lib/CodeGen/TwoAddressInstructionPass.cpp

Show First 20 Lines • Show All 104 Lines • ▼ Show 20 Lines	class TwoAddressInstructionPass : public MachineFunctionPass {

bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen);		bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen);

bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);		bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);

bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,		bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
MachineInstr *MI, unsigned Dist);		MachineInstr *MI, unsigned Dist);

bool commuteInstruction(MachineBasicBlock::iterator &mi,		bool commuteInstruction(MachineInstr *MI,
unsigned RegB, unsigned RegC, unsigned Dist);		unsigned RegBIdx, unsigned RegCIdx, unsigned Dist);

bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);		bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);

bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,		bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,		MachineBasicBlock::iterator &nmi,
unsigned RegA, unsigned RegB, unsigned Dist);		unsigned RegA, unsigned RegB, unsigned Dist);

bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI);		bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI);

bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,		bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,		MachineBasicBlock::iterator &nmi,
unsigned Reg);		unsigned Reg);
bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,		bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,		MachineBasicBlock::iterator &nmi,
unsigned Reg);		unsigned Reg);

bool tryInstructionTransform(MachineBasicBlock::iterator &mi,		bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,		MachineBasicBlock::iterator &nmi,
unsigned SrcIdx, unsigned DstIdx,		unsigned SrcIdx, unsigned DstIdx,
unsigned Dist, bool shouldOnlyCommute);		unsigned Dist, bool shouldOnlyCommute);

		bool tryInstructionCommute(MachineInstr *MI,
		unsigned DstOpIdx,
		unsigned BaseOpIdx,
		bool BaseOpKilled,
		unsigned Dist);
void scanUses(unsigned DstReg);		void scanUses(unsigned DstReg);

void processCopy(MachineInstr *MI);		void processCopy(MachineInstr *MI);

typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;		typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;		typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);		bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);		void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
▲ Show 20 Lines • Show All 497 Lines • ▼ Show 20 Lines	isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
// if the def of regC is closer. Its live interval is shorter.		// if the def of regC is closer. Its live interval is shorter.
return LastDefB && LastDefC && LastDefC > LastDefB;		return LastDefB && LastDefC && LastDefC > LastDefB;
}		}

/// commuteInstruction - Commute a two-address instruction and update the basic		/// commuteInstruction - Commute a two-address instruction and update the basic
/// block, distance map, and live variables if needed. Return true if it is		/// block, distance map, and live variables if needed. Return true if it is
/// successful.		/// successful.
bool TwoAddressInstructionPass::		bool TwoAddressInstructionPass::
commuteInstruction(MachineBasicBlock::iterator &mi,		commuteInstruction(MachineInstr *MI,
unsigned RegB, unsigned RegC, unsigned Dist) {		unsigned RegBIdx, unsigned RegCIdx, unsigned Dist) {
MachineInstr *MI = mi;		unsigned RegC = MI->getOperand(RegCIdx).getReg();
DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);		DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
MachineInstr *NewMI = TII->commuteInstruction(MI);		MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx);

if (NewMI == nullptr) {		if (NewMI == nullptr) {
DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");		DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
return false;		return false;
}		}

DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);		DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
assert(NewMI == MI &&		assert(NewMI == MI &&
▲ Show 20 Lines • Show All 488 Lines • ▼ Show 20 Lines	if (LIS) {
LV->removeVirtualRegisterKilled(Reg, KillMI);		LV->removeVirtualRegisterKilled(Reg, KillMI);
LV->addVirtualRegisterKilled(Reg, MI);		LV->addVirtualRegisterKilled(Reg, MI);
}		}

DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);		DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
return true;		return true;
}		}

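		/// Tries to commute the operand 'BaseOpIdx' with some other operand of the
		/// given machine instruction to improve opportunities for coalescing and
		/// for elimination of a register-to-register copy.
		///
		/// 'DstOpIdx' is the index of the destination operand of MI.
		/// 'BaseOpKilled' tells whether the register of the 'BaseOpIdx' operand is
		/// killed by MI.
		/// 'Dist' is forwarded to isProfitableToCommute() and commuteInstruction().
		///
		/// Returns true if the transformation happened. Otherwise, returns false.
		///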
		qcolombetUnsubmitted Not Done Reply Inline Actions Please document the meaning of the other arguments as well, like Dist, BaseOpKilled, etc. qcolombet: Please document the meaning of the other arguments as well, like Dist, BaseOpKilled, etc.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
		bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
		unsigned DstOpIdx,
		unsigned BaseOpIdx,
		bool BaseOpKilled,
		unsigned Dist) {
		unsigned OtherOpIdx = MI->getDesc().getNumDefs();
		for (; OtherOpIdx < MI->getDesc().getNumOperands(); OtherOpIdx++) {
		if (OtherOpIdx != BaseOpIdx &&
		TII->areOpsCommutable(MI, BaseOpIdx, OtherOpIdx)) {

		unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg();
		unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
		unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
		bool AggressiveCommute = false;

		qcolombetUnsubmitted Not Done Reply Inline Actions Invert the condition and use “continue”. (Per LLVM coding standard.) qcolombet: Invert the condition and use “continue”. (Per LLVM coding standard.)
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
		// If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
		// operands. This makes the live ranges of DstOp and OtherOp joinable.
		bool DoCommute =
		!BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);

		if (!DoCommute &&
		isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) {
		DoCommute = true;
		AggressiveCommute = true;
		}

		//
		// If it's profitable to commute, try to do so.
		//
		if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) {
		++NumCommuted;
		if (AggressiveCommute)
		++NumAggrCommuted;
		return true;
		}
		}
		}
		return false;
		}

/// tryInstructionTransform - For the case where an instruction has a single		/// tryInstructionTransform - For the case where an instruction has a single
/// pair of tied register operands, attempt some transformations that may		/// pair of tied register operands, attempt some transformations that may
/// either eliminate the tied operands or improve the opportunities for		/// either eliminate the tied operands or improve the opportunities for
/// coalescing away the register copy. Returns true if no copy needs to be		/// coalescing away the register copy. Returns true if no copy needs to be
/// inserted to untie mi's operands (either because they were untied, or		/// inserted to untie mi's operands (either because they were untied, or
/// because mi was rescheduled, and will be visited again later). If the		/// because mi was rescheduled, and will be visited again later). If the
/// shouldOnlyCommute flag is true, only instruction commutation is attempted.		/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
bool TwoAddressInstructionPass::		bool TwoAddressInstructionPass::
Show All 10 Lines	tryInstructionTransform(MachineBasicBlock::iterator &mi,

assert(TargetRegisterInfo::isVirtualRegister(regB) &&		assert(TargetRegisterInfo::isVirtualRegister(regB) &&
"cannot make instruction into two-address form");		"cannot make instruction into two-address form");
bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);		bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);

if (TargetRegisterInfo::isVirtualRegister(regA))		if (TargetRegisterInfo::isVirtualRegister(regA))
scanUses(regA);		scanUses(regA);

// Check if it is profitable to commute the operands.		bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
unsigned SrcOp1, SrcOp2;
unsigned regC = 0;
unsigned regCIdx = ~0U;
bool TryCommute = false;
bool AggressiveCommute = false;
if (MI.isCommutable() && MI.getNumOperands() >= 3 &&
TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) {
if (SrcIdx == SrcOp1)
regCIdx = SrcOp2;
else if (SrcIdx == SrcOp2)
regCIdx = SrcOp1;

if (regCIdx != ~0U) {
regC = MI.getOperand(regCIdx).getReg();
if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false))
// If C dies but B does not, swap the B and C operands.
// This makes the live ranges of A and C joinable.
TryCommute = true;
else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) {
TryCommute = true;
AggressiveCommute = true;
}
}
}

// If the instruction is convertible to 3 Addr, instead		// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and		// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.		// use this variable to check later. Because it might be better.
// For example, we can just use `leal (%rsi,%rdi), %eax` and `ret`		// For example, we can just use `leal (%rsi,%rdi), %eax` and `ret`
// instead of the following code.		// instead of the following code.
// addl %esi, %edi		// addl %esi, %edi
// movl %edi, %eax		// movl %edi, %eax
// ret		// ret
bool Commuted = false;		if (Commuted && !MI.isConvertibleTo3Addr())

// If it's profitable to commute, try to do so.
if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) {
Commuted = true;
++NumCommuted;
if (AggressiveCommute)
++NumAggrCommuted;
if (!MI.isConvertibleTo3Addr())
return false;		return false;
}

if (shouldOnlyCommute)		if (shouldOnlyCommute)
return false;		return false;

// If there is one more use of regB later in the same MBB, consider		// If there is one more use of regB later in the same MBB, consider
// re-schedule this MI below it.		// re-schedule this MI below it.
if (!Commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {		if (!Commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {
++NumReSchedDowns;		++NumReSchedDowns;
▲ Show 20 Lines • Show All 559 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
// If we are already folding into another operand of MI, then		// If we are already folding into another operand of MI, then
// we can't commute the instruction, otherwise we risk making the		// we can't commute the instruction, otherwise we risk making the
// other fold illegal.		// other fold illegal.
if (isUseMIInFoldList(FoldList, MI))		if (isUseMIInFoldList(FoldList, MI))
return false;		return false;

// Operand is not legal, so try to commute the instruction to		// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.		// see if this makes it possible to fold.
unsigned CommuteIdx0;		unsigned CommuteIdx0 = ~0U;
unsigned CommuteIdx1;		unsigned CommuteIdx1 = ~0U;
bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);		bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);

if (CanCommute) {		if (CanCommute) {
if (CommuteIdx0 == OpNo)		if (CommuteIdx0 == OpNo)
OpNo = CommuteIdx1;		OpNo = CommuteIdx1;
else if (CommuteIdx1 == OpNo)		else if (CommuteIdx1 == OpNo)
OpNo = CommuteIdx0;		OpNo = CommuteIdx0;
}		}

if (!CanCommute \|\| !TII->commuteInstruction(MI))		// FIXME: OpNo can be commuted with non-reg operand OtherOpNo, but
		// such test cases are not handled well yet.
		qcolombetUnsubmitted Not Done Reply Inline Actions It is not handled at all, right? Shouldn’t findCommutedOpIndices return only register operands? qcolombet: It is not handled at all, right? Shouldn’t findCommutedOpIndices return only register operands?
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions findCommutedOpIndices() returns commutable operands. They can be Imm operands for AMDGPU target. I also updated the FIXME comment to make it more informative. v_klochkov: findCommutedOpIndices() returns commutable operands. They can be Imm operands for AMDGPU target.
		if (CanCommute &&
		(!MI->getOperand(CommuteIdx0).isReg() \|\| !MI->getOperand(CommuteIdx1).isReg()))
		return false;

		if (!CanCommute \|\| !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
		qcolombetUnsubmitted Not Done Reply Inline Actions The comment is just fine without the FIXME in front of it. Just remove it, i.e., I do not think there is anything to fix here in the end. qcolombet: The comment is just fine without the FIXME in front of it. Just remove it, i.e., I do not think…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. I removed the "FIXME" word. v_klochkov: Fixed. I removed the "FIXME" word.
return false;		return false;

		qcolombetUnsubmitted Not Done Reply Inline Actions Since you said the target can commute imm and reg operand, why do we need this change? qcolombet: Since you said the target can commute imm and reg operand, why do we need this change?
if (!TII->isOperandLegal(MI, OpNo, OpToFold))		if (!TII->isOperandLegal(MI, OpNo, OpToFold))
return false;		return false;
}		}

FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));		FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
return true;		return true;
}		}

▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Show First 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	public:

// \brief Returns an opcode that can be used to move a value to a \p DstRC		// \brief Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p		// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.		// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;		unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
int commuteOpcode(const MachineInstr &MI) const;		int commuteOpcode(const MachineInstr &MI) const;

MachineInstr commuteInstruction(MachineInstr MI,		MachineInstr commuteInstruction(MachineInstr MI,
bool NewMI = false) const override;		bool NewMI,
		unsigned OpIdx0,
		unsigned OpIdx1) const override;

bool findCommutedOpIndices(MachineInstr *MI,		bool findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,		unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;		unsigned &SrcOpIdx2) const override;

bool isTriviallyReMaterializable(const MachineInstr *MI,		bool isTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA = nullptr) const;		AliasAnalysis *AA = nullptr) const;

bool areMemAccessesTriviallyDisjoint(		bool areMemAccessesTriviallyDisjoint(
▲ Show 20 Lines • Show All 264 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 758 Lines • ▼ Show 20 Lines	BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
.addOperand(SrcCond);		.addOperand(SrcCond);
MI->eraseFromParent();		MI->eraseFromParent();
break;		break;
}		}
}		}
return true;		return true;
}		}

		/// Commutes the operands in the given instruction.
		/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
		///
		/// Do not call this method for a non-commutable instruction or for
		/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
MachineInstr SIInstrInfo::commuteInstruction(MachineInstr MI,		MachineInstr SIInstrInfo::commuteInstruction(MachineInstr MI,
bool NewMI) const {		bool NewMI,
		unsigned OpIdx0,
		unsigned OpIdx1) const {

if (MI->getNumOperands() < 3)		if (MI->getNumOperands() < 3)
return nullptr;		return nullptr;
		arsenmUnsubmitted Not Done Reply Inline Actions I just removed this check a few days ago, so you probably will have a conflict when you apply this to trunk arsenm: I just removed this check a few days ago, so you probably will have a conflict when you apply…

int CommutedOpcode = commuteOpcode(*MI);		int CommutedOpcode = commuteOpcode(*MI);
if (CommutedOpcode == -1)		if (CommutedOpcode == -1)
return nullptr;		return nullptr;

int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),		int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src0);		AMDGPU::OpName::src0);
assert(Src0Idx != -1 && "Should always have src0 operand");		assert(Src0Idx != -1 && "Should always have src0 operand");

MachineOperand &Src0 = MI->getOperand(Src0Idx);		MachineOperand &Src0 = MI->getOperand(Src0Idx);
if (!Src0.isReg())		if (!Src0.isReg())
return nullptr;		return nullptr;

int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),		int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src1);		AMDGPU::OpName::src1);
if (Src1Idx == -1)		assert(Src1Idx != -1 && "Should always have src1 operand");

		if (!(OpIdx0 == static_cast<unsigned>(Src0Idx) &&
		OpIdx1 == static_cast<unsigned>(Src1Idx)) &&
		arsenmUnsubmitted Not Done Reply Inline Actions I would prefer to deMorgan's law this and distribute the ! arsenm: I would prefer to deMorgan's law this and distribute the !
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, Fixed. v_klochkov: Ok, Fixed.
		!(OpIdx0 == static_cast<unsigned>(Src1Idx) &&
		OpIdx1 == static_cast<unsigned>(Src0Idx)))
		qcolombetUnsubmitted Not Done Reply Inline Actions I still don't get why we need to turn this if into an assert. qcolombet: I still don't get why we need to turn this if into an assert.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions (Src1Idx == -1) is impossible here as this place is reachable only after findCommutedOpIndices() call which filters out such situations. This assert for Src1Idx is semantically and stylistically equivalent to the assert at the line 788 (assert for Src0Idx). v_klochkov: (Src1Idx == -1) is impossible here as this place is reachable only after findCommutedOpIndices…
		arsenmUnsubmitted Not Done Reply Inline Actions These asserts can be removed. I decided that there's no point to checking if src0/src1 are valid operands arsenm: These asserts can be removed. I decided that there's no point to checking if src0/src1 are…
return nullptr;		return nullptr;

MachineOperand &Src1 = MI->getOperand(Src1Idx);		MachineOperand &Src1 = MI->getOperand(Src1Idx);

// Make sure it's legal to commute operands for VOP2.		// Make sure it's legal to commute operands for VOP2.
if (isVOP2(MI->getOpcode()) &&		if (isVOP2(MI->getOpcode()) &&
(!isOperandLegal(MI, Src0Idx, &Src1) \|\|		(!isOperandLegal(MI, Src0Idx, &Src1) \|\|
!isOperandLegal(MI, Src1Idx, &Src0))) {		!isOperandLegal(MI, Src1Idx, &Src0))) {
Show All 31 Lines	if (!Src1.isReg()) {
if (Src1.isImm())		if (Src1.isImm())
Src0.ChangeToImmediate(Src1.getImm());		Src0.ChangeToImmediate(Src1.getImm());
else		else
llvm_unreachable("Should only have immediates");		llvm_unreachable("Should only have immediates");

Src1.ChangeToRegister(Reg, false);		Src1.ChangeToRegister(Reg, false);
Src1.setSubReg(SubReg);		Src1.setSubReg(SubReg);
} else {		} else {
MI = TargetInstrInfo::commuteInstruction(MI, NewMI);		MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx0, OpIdx1);
}		}

if (MI)		if (MI)
MI->setDesc(get(CommutedOpcode));		MI->setDesc(get(CommutedOpcode));

return MI;		return MI;
}		}

// This needs to be implemented because the source modifiers may be inserted		// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base		// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.		// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,		bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,		unsigned &SrcOpIdx0,
unsigned &SrcOpIdx2) const {		unsigned &SrcOpIdx1) const {
const MCInstrDesc &MCID = MI->getDesc();		const MCInstrDesc &MCID = MI->getDesc();
if (!MCID.isCommutable())		if (!MCID.isCommutable())
return false;		return false;

unsigned Opc = MI->getOpcode();		unsigned Opc = MI->getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);		int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)		if (Src0Idx == -1)
return false;		return false;

// FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on		// FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
// immediate.		// immediate. Also, immediate src0 operand is not handled in
		// SIInstrInfo::commuteInstruction();
		arsenmUnsubmitted Not Done Reply Inline Actions Typo: immeditate arsenm: Typo: immeditate
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed: immeditate -> immediate. v_klochkov: Fixed: immeditate -> immediate.
if (!MI->getOperand(Src0Idx).isReg())		if (!MI->getOperand(Src0Idx).isReg())
return false;		return false;

int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);		int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)		if (Src1Idx == -1)
return false;		return false;

if (!MI->getOperand(Src1Idx).isReg())		MachineOperand &Src1 = MI->getOperand(Src1Idx);
		if (Src1.isImm()) {
		// SIInstrInfo::commuteInstruction() does support commuting the immediate
		// operand src1 in 2 and 3 operand instructions.
		if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
		qcolombet: I am guessing that you shouldn’t be the one doing this kind of change. Or at least, it should be a separate patch.
		arsenmUnsubmitted Not Done Reply Inline Actions It's not entirely accurate to use isVOP2 / isVOP3 for checking the number of operands. VOP* instructions are always available in a VOP3 encoding, but will still have < 3 operands. Checking if AMDGPU::OpName::src2 is a valid operand is a more reliable check. SALU instructions with an immediate can also be commuted, although there is less reason to do so other than canonicalization. Although it looks like isVOP2/isVOP3 is what commuteInstruction already checks so I guess this is OK for now. arsenm: It's not entirely accurate to use isVOP2 / isVOP3 for checking the number of operands. VOP*…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Thank you for the explanations. I am quite happy that you are Ok with the current version of the changes as the fixing of such subtle things should be done by AMDGPU experts. In this change-set I just synchronized the checks in findOpIndicesToCommute() and commuteInstructionImpl() (i.e. re-used the checks from commuteInstructionImpl() in findOpIndicesToCommute()). v_klochkov: Thank you for the explanations. I am quite happy that you are Ok with the current version of…
return false;		return false;
		}
		else if (Src1.isReg()) {
// If any source modifiers are set, the generic instruction commuting won't		// If any source modifiers are set, the generic instruction commuting won't
// understand how to copy the source modifiers.		// understand how to copy the source modifiers.
if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) \|\|		if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) \|\|
hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))		hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
return false;		return false;
		}
		else
		return false;

SrcOpIdx1 = Src0Idx;		return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
SrcOpIdx2 = Src1Idx;
return true;
}		}

MachineInstr SIInstrInfo::buildMovInstr(MachineBasicBlock MBB,		MachineInstr SIInstrInfo::buildMovInstr(MachineBasicBlock MBB,
MachineBasicBlock::iterator I,		MachineBasicBlock::iterator I,
unsigned DstReg,		unsigned DstReg,
unsigned SrcReg) const {		unsigned SrcReg) const {
return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),		return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
DstReg) .addReg(SrcReg);		DstReg) .addReg(SrcReg);
▲ Show 20 Lines • Show All 830 Lines • ▼ Show 20 Lines	if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
// Legalize src1		// Legalize src1
if (isOperandLegal(MI, Src1Idx))		if (isOperandLegal(MI, Src1Idx))
return;		return;

// Usually src0 of VOP2 instructions allow more types of inputs		// Usually src0 of VOP2 instructions allow more types of inputs
// than src1, so try to commute the instruction to decrease our		// than src1, so try to commute the instruction to decrease our
// chances of having to insert a MOV instruction to legalize src1.		// chances of having to insert a MOV instruction to legalize src1.
if (MI->isCommutable()) {		if (MI->isCommutable()) {
if (commuteInstruction(MI))		if (TargetInstrInfo::commuteInstruction(MI))
// If we are successful in commuting, then we know MI is legal, so		// If we are successful in commuting, then we know MI is legal, so
// we are done.		// we are done.
return;		return;
}		}

legalizeOpWithMove(MI, Src1Idx);		legalizeOpWithMove(MI, Src1Idx);
return;		return;
}		}
▲ Show 20 Lines • Show All 1,046 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Show First 20 Lines • Show All 176 Lines • ▼ Show 20 Lines	if (Def && Def->isMoveImmediate()) {
Def->eraseFromParent();		Def->eraseFromParent();
++NumLiteralConstantsFolded;		++NumLiteralConstantsFolded;
return;		return;
}		}
}		}
}		}

// We have failed to fold src0, so commute the instruction and try again.		// We have failed to fold src0, so commute the instruction and try again.
if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))		if (TryToCommute && MI.isCommutable() &&
		TII->TargetInstrInfo::commuteInstruction(&MI))
foldImmediates(MI, TII, MRI, false);		foldImmediates(MI, TII, MRI, false);

}		}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {		bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII =		const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());		static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
Show All 22 Lines	for (I = MBB.begin(); I != MBB.end(); I = Next) {
}		}

if (!TII->hasVALU32BitEncoding(MI.getOpcode()))		if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;		continue;

if (!canShrink(MI, TII, TRI, MRI)) {		if (!canShrink(MI, TII, TRI, MRI)) {
// Try commuting the instruction and see if that enables us to shrink		// Try commuting the instruction and see if that enables us to shrink
// it.		// it.
if (!MI.isCommutable() \|\| !TII->commuteInstruction(&MI) \|\|		if (!MI.isCommutable() \|\|
		!TII->TargetInstrInfo::commuteInstruction(&MI) \|\|
!canShrink(MI, TII, TRI, MRI))		!canShrink(MI, TII, TRI, MRI))
continue;		continue;
}		}

// getVOPe32 could be -1 here if we started with an instruction that had		// getVOPe32 could be -1 here if we started with an instruction that had
// a 32-bit encoding and then commuted it to an instruction that did not.		// a 32-bit encoding and then commuted it to an instruction that did not.
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))		if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;		continue;
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMBaseInstrInfo.h

Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	public:
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,		void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,		unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig,		const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const override;		const TargetRegisterInfo &TRI) const override;

MachineInstr duplicate(MachineInstr Orig,		MachineInstr duplicate(MachineInstr Orig,
MachineFunction &MF) const override;		MachineFunction &MF) const override;

MachineInstr commuteInstruction(MachineInstr,		MachineInstr commuteInstruction(MachineInstr MI,
bool=false) const override;		bool NewMI,
		unsigned OpIdx1,
		unsigned OpIdx2) const override;

const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,		const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,		unsigned SubIdx, unsigned State,
const TargetRegisterInfo *TRI) const;		const TargetRegisterInfo *TRI) const;

bool produceSameValue(const MachineInstr MI0, const MachineInstr MI1,		bool produceSameValue(const MachineInstr MI0, const MachineInstr MI1,
const MachineRegisterInfo *MRI) const override;		const MachineRegisterInfo *MRI) const override;

▲ Show 20 Lines • Show All 298 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

Show First 20 Lines • Show All 1,738 Lines • ▼ Show 20 Lines	unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
if (Opc == ARM::tB)		if (Opc == ARM::tB)
return ARM::tBcc;		return ARM::tBcc;
if (Opc == ARM::t2B)		if (Opc == ARM::t2B)
return ARM::t2Bcc;		return ARM::t2Bcc;

llvm_unreachable("Unknown unconditional branch opcode!");		llvm_unreachable("Unknown unconditional branch opcode!");
}		}

/// commuteInstruction - Handle commutable instructions.		/// Commutes the operands in the given instruction.
		/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
		///
		/// Do not call this method for a non-commutable instruction or for
		/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
MachineInstr *		MachineInstr *
ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {		ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
		unsigned OpIdx1, unsigned OpIdx2) const {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
case ARM::MOVCCr:		case ARM::MOVCCr:
case ARM::t2MOVCCr: {		case ARM::t2MOVCCr: {
// MOVCC can be commuted by inverting the condition.		// MOVCC can be commuted by inverting the condition.
unsigned PredReg = 0;		unsigned PredReg = 0;
ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);		ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
// MOVCC AL can't be inverted. Shouldn't happen.		// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL \|\| PredReg != ARM::CPSR)		if (CC == ARMCC::AL \|\| PredReg != ARM::CPSR)
return nullptr;		return nullptr;
MI = TargetInstrInfo::commuteInstruction(MI, NewMI);		MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
if (!MI)		if (!MI)
return nullptr;		return nullptr;
// After swapping the MOVCC operands, also invert the condition.		// After swapping the MOVCC operands, also invert the condition.
MI->getOperand(MI->findFirstPredOperandIdx())		MI->getOperand(MI->findFirstPredOperandIdx())
.setImm(ARMCC::getOppositeCondition(CC));		.setImm(ARMCC::getOppositeCondition(CC));
return MI;		return MI;
}		}
}		}
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}

/// Identify instructions that can be folded into a MOVCC instruction, and		/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.		/// return the defining instruction.
static MachineInstr *canFoldIntoMOVCC(unsigned Reg,		static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
const MachineRegisterInfo &MRI,		const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) {		const TargetInstrInfo *TII) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))		if (!TargetRegisterInfo::isVirtualRegister(Reg))
▲ Show 20 Lines • Show All 2,811 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/Thumb2SizeReduction.cpp

Show First 20 Lines • Show All 648 Lines • ▼ Show 20 Lines	if (!isARMLowRegister(Reg0) \|\| !isARMLowRegister(Reg1)
\|\| !isARMLowRegister(Reg2))		\|\| !isARMLowRegister(Reg2))
return false;		return false;
if (Reg0 != Reg2) {		if (Reg0 != Reg2) {
// If the other operand also isn't the same as the destination, we		// If the other operand also isn't the same as the destination, we
// can't reduce.		// can't reduce.
if (Reg1 != Reg0)		if (Reg1 != Reg0)
return false;		return false;
// Try to commute the operands to make it a 2-address instruction.		// Try to commute the operands to make it a 2-address instruction.
MachineInstr *CommutedMI = TII->commuteInstruction(MI);		MachineInstr *CommutedMI = TII->TargetInstrInfo::commuteInstruction(MI);
if (!CommutedMI)		if (!CommutedMI)
return false;		return false;
}		}
} else if (Reg0 != Reg1) {		} else if (Reg0 != Reg1) {
// Try to commute the operands to make it a 2-address instruction.		// Try to commute the operands to make it a 2-address instruction.
unsigned CommOpIdx1, CommOpIdx2;		unsigned CommOpIdx1 = 1, CommOpIdx2 = ~0U;
if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) \|\|		if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) \|\|
CommOpIdx1 != 1 \|\| MI->getOperand(CommOpIdx2).getReg() != Reg0)		MI->getOperand(CommOpIdx2).getReg() != Reg0)
return false;		return false;
MachineInstr *CommutedMI = TII->commuteInstruction(MI);		MachineInstr *CommutedMI = TII->commuteInstruction(MI, false,
		CommOpIdx1, CommOpIdx2);
if (!CommutedMI)		if (!CommutedMI)
return false;		return false;
}		}
if (Entry.LowRegs2 && !isARMLowRegister(Reg0))		if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
return false;		return false;
if (Entry.Imm2Limit) {		if (Entry.Imm2Limit) {
unsigned Imm = MI->getOperand(2).getImm();		unsigned Imm = MI->getOperand(2).getImm();
unsigned Limit = (1 << Entry.Imm2Limit) - 1;		unsigned Limit = (1 << Entry.Imm2Limit) - 1;
▲ Show 20 Lines • Show All 362 Lines • Show Last 20 Lines

llvm/lib/Target/PowerPC/PPCInstrInfo.h

Show First 20 Lines • Show All 153 Lines • ▼ Show 20 Lines	public:
bool isCoalescableExtInstr(const MachineInstr &MI,		bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,		unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const override;		unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr *MI,		unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;		int &FrameIndex) const override;
unsigned isStoreToStackSlot(const MachineInstr *MI,		unsigned isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;		int &FrameIndex) const override;

// commuteInstruction - We can commute rlwimi instructions, but only if the		/// Commutes the operands in the given instruction.
// rotate amt is zero. We also have to munge the immediates a bit.		/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;		///
		/// Do not call this method for a non-commutable instruction or for
		/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
		/// For example, we can commute rlwimi instructions, but only if the
		/// rotate amt is zero. We also have to munge the immediates a bit.
		///
		MachineInstr *commuteInstruction(MachineInstr *MI,
		bool NewMI,
		unsigned OpIdx1,
		unsigned OpIdx2) const override;

bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,		bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;		unsigned &SrcOpIdx2) const override;

void insertNoop(MachineBasicBlock &MBB,		void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;		MachineBasicBlock::iterator MI) const override;


▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines

llvm/lib/Target/PowerPC/PPCInstrInfo.cpp

Show First 20 Lines • Show All 515 Lines • ▼ Show 20 Lines	if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
FrameIndex = MI->getOperand(2).getIndex();		FrameIndex = MI->getOperand(2).getIndex();
return MI->getOperand(0).getReg();		return MI->getOperand(0).getReg();
}		}
break;		break;
}		}
return 0;		return 0;
}		}

// commuteInstruction - We can commute rlwimi instructions, but only if the		/// Commutes the operands in the given instruction.
// rotate amt is zero. We also have to munge the immediates a bit.		/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
		///
		/// Do not call this method for a non-commutable instruction or for
		/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
		/// For example, we can commute rlwimi instructions, but only if the
		/// rotate amt is zero. We also have to munge the immediates a bit.
		///
MachineInstr *		MachineInstr *
PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {		PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
		unsigned OpIdx1, unsigned OpIdx2) const {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();

// Normal instructions can be commuted the obvious way.		// Normal instructions can be commuted the obvious way.
if (MI->getOpcode() != PPC::RLWIMI &&		if (MI->getOpcode() != PPC::RLWIMI &&
MI->getOpcode() != PPC::RLWIMIo)		MI->getOpcode() != PPC::RLWIMIo)
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a		// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because		// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
// changing the relative order of the mask operands might change what happens		// changing the relative order of the mask operands might change what happens
// to the high-bits of the mask (and, thus, the result).		// to the high-bits of the mask (and, thus, the result).

// Cannot commute if it has a non-zero rotate count.		// Cannot commute if it has a non-zero rotate count.
if (MI->getOperand(3).getImm() != 0)		if (MI->getOperand(3).getImm() != 0)
return nullptr;		return nullptr;

// If we have a zero rotate count, we have:		// If we have a zero rotate count, we have:
// M = mask(MB,ME)		// M = mask(MB,ME)
// Op0 = (Op1 & ~M) \| (Op2 & M)		// Op0 = (Op1 & ~M) \| (Op2 & M)
// Change this to:		// Change this to:
// M = mask((ME+1)&31, (MB-1)&31)		// M = mask((ME+1)&31, (MB-1)&31)
// Op0 = (Op2 & ~M) \| (Op1 & M)		// Op0 = (Op2 & ~M) \| (Op1 & M)

// Swap op1/op2		// Swap op1/op2
		assert(((OpIdx1 == 1 && OpIdx2 == 2) \|\| (OpIdx1 == 2 && OpIdx2 == 1)) &&
		"Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
unsigned Reg0 = MI->getOperand(0).getReg();		unsigned Reg0 = MI->getOperand(0).getReg();
unsigned Reg1 = MI->getOperand(1).getReg();		unsigned Reg1 = MI->getOperand(1).getReg();
unsigned Reg2 = MI->getOperand(2).getReg();		unsigned Reg2 = MI->getOperand(2).getReg();
unsigned SubReg1 = MI->getOperand(1).getSubReg();		unsigned SubReg1 = MI->getOperand(1).getSubReg();
unsigned SubReg2 = MI->getOperand(2).getSubReg();		unsigned SubReg2 = MI->getOperand(2).getSubReg();
bool Reg1IsKill = MI->getOperand(1).isKill();		bool Reg1IsKill = MI->getOperand(1).isKill();
bool Reg2IsKill = MI->getOperand(2).isKill();		bool Reg2IsKill = MI->getOperand(2).isKill();
bool ChangeReg0 = false;		bool ChangeReg0 = false;
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
// For VSX A-Type FMA instructions, it is the first two operands that can be		// For VSX A-Type FMA instructions, it is the first two operands that can be
// commuted, however, because the non-encoded tied input operand is listed		// commuted, however, because the non-encoded tied input operand is listed
// first, the operands to swap are actually the second and third.		// first, the operands to swap are actually the second and third.

int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());		int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
if (AltOpc == -1)		if (AltOpc == -1)
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);		return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);

SrcOpIdx1 = 2;		// The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
SrcOpIdx2 = 3;		// and SrcOpIdx2.
return true;		return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
}		}

void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,		void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {		MachineBasicBlock::iterator MI) const {
// This function is used for scheduling, and the nop wanted here is the type		// This function is used for scheduling, and the nop wanted here is the type
// that terminates dispatch groups on the POWER cores.		// that terminates dispatch groups on the POWER cores.
unsigned Directive = Subtarget.getDarwinDirective();		unsigned Directive = Subtarget.getDarwinDirective();
unsigned Opcode;		unsigned Opcode;
▲ Show 20 Lines • Show All 1,373 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrFMA.td

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(MemFrag256 addr:$src3))))]>, VEX_L;		(MemFrag256 addr:$src3))))]>, VEX_L;
}		}
} // Constraints = "$src1 = $dst"		} // Constraints = "$src1 = $dst"

multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,		string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,		PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {		SDNode Op, ValueType OpTy128, ValueType OpTy256> {
// For 213, both the register and memory variant are commutable.		let hasSideEffects = 0 in {
// Indeed, the commutable operands are 1 and 2 and both live in registers		// For 213, both the register and memory variants are commutable.
// for both variants.		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 213 --> 213(no changes);
		// operands 1 and 3 (register forms only): 213 --> 231;
		// operands 2 and 3 (register forms only): 213 --> 132.
defm r213 : fma3p_rm<opc213,		defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),		!strconcat(OpcodeStr, "213", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 1,		/* IsMVariantCommutable */ 1,
Op>;		Op>;
let hasSideEffects = 0 in {		// For 132, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 132 --> 231;
		// operands 1 and 3 (register forms only): 132 --> 132(no changes);
		// operands 2 and 3 (register forms only): 132 --> 213.
defm r132 : fma3p_rm<opc132,		defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),		!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;		MemFrag128, MemFrag256, OpTy128, OpTy256,
// For 231, only the register variant is commutable.		/* IsRVariantCommutable */ 1,
		/* IsMVariantCommutable */ 1>;
		// For 231, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
// For the memory variant the folded operand must be in 3. Thus,		// For the memory variant the folded operand must be in 3. Thus,
// in that case, it cannot be swapped with 2.		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 231 --> 132;
		// operands 1 and 3 (register forms only): 231 --> 213;
		// operands 2 and 3 (register forms only): 231 --> 231(no changes).
defm r231 : fma3p_rm<opc231,		defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),		!strconcat(OpcodeStr, "231", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 1>;
} // hasSideEffects = 0		} // hasSideEffects = 0
}		}

// Fused Multiply-Add		// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {		let ExeDomain = SSEPackedSingle in {
defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,		defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
loadv8f32, X86Fmadd, v4f32, v8f32>;		loadv8f32, X86Fmadd, v4f32, v8f32>;
defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,		defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines
} // Constraints = "$src1 = $dst"		} // Constraints = "$src1 = $dst"

multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string PT2, Intrinsic Int,		string OpStr, string PackTy, string PT2, Intrinsic Int,
SDNode OpNode, RegisterClass RC, ValueType OpVT,		SDNode OpNode, RegisterClass RC, ValueType OpVT,
X86MemOperand x86memop, Operand memop, PatFrag mem_frag,		X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
ComplexPattern mem_cpat> {		ComplexPattern mem_cpat> {
let hasSideEffects = 0 in {		let hasSideEffects = 0 in {
		// For 132, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 132 --> 231;
		// operands 1 and 3 (register forms only): 132 --> 132(no changes);
		// operands 2 and 3 (register forms only): 132 --> 213.
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),		defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;		x86memop, RC, OpVT, mem_frag,
// See the other defm of r231 for the explanation regarding the		/* IsRVariantCommutable */ 1,
// commutable flags.		/* IsMVariantCommutable */ 1>;
		// For 231, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 231 --> 132;
		// operands 1 and 3 (register forms only): 231 --> 213;
		// operands 2 and 3 (register forms only): 231 --> 231(no changes).
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),		defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
x86memop, RC, OpVT, mem_frag,		x86memop, RC, OpVT, mem_frag,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 1>;
}

// See the other defm of r213 for the explanation regarding the		// For 213, both the register and memory variants are commutable.
// commutable flags.		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 213 --> 213(no changes);
		// operands 1 and 3 (register forms only): 213 --> 231;
		// operands 2 and 3 (register forms only): 213 --> 132.
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),		defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
x86memop, RC, OpVT, mem_frag,		x86memop, RC, OpVT, mem_frag,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 1,		/* IsMVariantCommutable */ 1,
OpNode>;		OpNode>;
}		}
		}

multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,		string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {		SDNode OpNode> {
defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,		defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;		FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,		defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;		FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
▲ Show 20 Lines • Show All 211 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrInfo.h

Show First 20 Lines • Show All 253 Lines • ▼ Show 20 Lines	public:
///		///
/// This method returns a null pointer if the transformation cannot be		/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.		/// performed, otherwise it returns the new instruction.
///		///
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,		MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,		MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const override;		LiveVariables *LV) const override;

/// commuteInstruction - We have a few instructions that must be hacked on to		/// Commutes the operands in the given instruction by changing the operands
/// commute them.		/// order and/or changing the instruction's opcode and/or the immediate value
		/// operand.
		///
		/// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
		/// to be commuted.
		///
		/// Do not call this method for a non-commutable instruction.
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
		///
		MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI,
		unsigned CommuteOpIdx1,
		unsigned CommuteOpIdx2) const override;

		/// Returns true iff the routine could find two commutable operands in the
		/// given machine instruction.
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. An
		/// index may be re-defined by this method only if its input value is not
		/// pre-defined, which is designated by the special value ~0U assigned to
		/// that index.
		/// If both of indices are pre-defined and refer to some operands, then the
		/// method simply returns true if the corresponding operands are commutable
		/// and returns false otherwise.
		///
		/// For example, calling this method this way:
		/// unsigned Op1 = 1, Op2 = ~0U;
		/// findCommutedOpIndices(MI, Op1, Op2);
		/// can be interpreted as a query asking to find an operand that would be
		/// commutable with the operand#1.
///		///
MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;

bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,		bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;		unsigned &SrcOpIdx2) const override;

		/// Returns true if the routine could find two commutable operands
		/// in the given FMA instruction. Otherwise, returns false.
		///
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
		/// The output indices of the commuted operands are returned in these
		/// arguments. Also, the input values of these arguments may be preset either
		/// to indices of operands that must be commuted or be equal to a special
		/// value (~0U) which means that the corresponding operand index is not set
		/// and this method is free to pick any of available commutable operands.
		///
		/// For example, calling this method this way:
		/// unsigned Op1 = 1, Op2 = ~0U;
		/// findFMA3CommutedOpIndices(MI, Op1, Op2);
		/// can be interpreted as a query asking if the operand #1 can be swapped
		/// with any other available operand (e.g. operand #2, operand #3, etc.).
		///
		/// Commuting the found operands may require an FMA opcode that differs
		/// from the opcode in the given MI.
		/// For example, commuting the operands #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results in an instruction with an adjusted opcode:
		/// FMA231 #3, #2, #1
		///
		bool findFMA3CommutedOpIndices(MachineInstr *MI,
		unsigned &SrcOpIdx1,
		unsigned &SrcOpIdx2) const;

		/// Returns an adjusted FMA opcode that must be used in FMA instruction that
		/// performs the same computations as the given MI but which has the operands
		/// SrcOpIdx1 and SrcOpIdx2 commuted.
		/// It may return 0 if it is unsafe to commute the operands.
		///
		/// The returned FMA opcode may differ from the opcode in the given MI.
		/// For example, commuting the operands #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results into instruction with adjusted opcode:
		/// FMA231 #3, #2, #1
		///
		unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
		unsigned SrcOpIdx1,
		unsigned SrcOpIdx2) const;

		/// Returns true if the given instruction opcode is FMA3.
		/// Otherwise, returns false.
		///
		bool isFMA3(unsigned Opcode) const;

// Branch analysis.		// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr* MI) const override;		bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,		bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,		MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,		SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;		bool AllowModify) const override;

bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,		bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
▲ Show 20 Lines • Show All 235 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,917 Lines • ▼ Show 20 Lines	if (LV) { // Update live variables
if (Dest.isDead())		if (Dest.isDead())
LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);		LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
}		}

MFI->insert(MBBI, NewMI); // Insert the new inst		MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;		return NewMI;
}		}

/// We have a few instructions that must be hacked on to commute them.		/// Commutes the operands in the given instruction by changing the operands
		/// order and/or changing the instruction's opcode and/or the immediate value
		/// operand.
		/// The arguments 'OpIdx1' and 'OpIdx2' specify the operands to be commuted.
		///
		/// Do not call this method for a non-commutable instruction.
		delenaUnsubmitted Not Done Reply Inline Actions I suggest changing "Do not call" to "If you call". delena: I suggest changing "Do not call" to "If you call".
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions This "Do not call" comment was moved here from the old version of include/llvm/Target/TargetInstrInfo.h. TargetInstrInfo::commuteInstruction() has an assert verifying that MI is commutable. Taking that assert into account, this comment seems quite precise. v_klochkov: This "Do not call" comment was moved here from the old version of…
		/// Even though the instruction is commutable, the method may still
		/// fail to commute the operands, null pointer is returned in such cases.
///		///
MachineInstr *		MachineInstr *
X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {		X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
		unsigned OpIdx1,
		unsigned OpIdx2) const {


		delenaUnsubmitted Not Done Reply Inline Actions Please remove one empty line. delena: Please remove one empty line.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, removed it, the updated version of the change-set will have this fix. v_klochkov: Ok, removed it, the updated version of the change-set will have this fix.
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)		case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)		case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)		case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)		case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)		case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)		case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
unsigned Opc;		unsigned Opc;
Show All 10 Lines	case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
unsigned Amt = MI->getOperand(3).getImm();		unsigned Amt = MI->getOperand(3).getImm();
if (NewMI) {		if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->setDesc(get(Opc));		MI->setDesc(get(Opc));
MI->getOperand(3).setImm(Size-Amt);		MI->getOperand(3).setImm(Size-Amt);
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}
case X86::BLENDPDrri:		case X86::BLENDPDrri:
case X86::BLENDPSrri:		case X86::BLENDPSrri:
case X86::PBLENDWrri:		case X86::PBLENDWrri:
case X86::VBLENDPDrri:		case X86::VBLENDPDrri:
case X86::VBLENDPSrri:		case X86::VBLENDPSrri:
case X86::VBLENDPDYrri:		case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:		case X86::VBLENDPSYrri:
Show All 19 Lines	case X86::VPBLENDWYrri:{
// Only the least significant bits of Imm are used.		// Only the least significant bits of Imm are used.
unsigned Imm = MI->getOperand(3).getImm() & Mask;		unsigned Imm = MI->getOperand(3).getImm() & Mask;
if (NewMI) {		if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->getOperand(3).setImm(Mask ^ Imm);		MI->getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}
case X86::PCLMULQDQrr:		case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{		case X86::VPCLMULQDQrr:{
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]		// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]		// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
unsigned Imm = MI->getOperand(3).getImm();		unsigned Imm = MI->getOperand(3).getImm();
unsigned Src1Hi = Imm & 0x01;		unsigned Src1Hi = Imm & 0x01;
unsigned Src2Hi = Imm & 0x10;		unsigned Src2Hi = Imm & 0x10;
if (NewMI) {		if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->getOperand(3).setImm((Src1Hi << 4) \| (Src2Hi >> 4));		MI->getOperand(3).setImm((Src1Hi << 4) \| (Src2Hi >> 4));
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}
case X86::CMPPDrri:		case X86::CMPPDrri:
case X86::CMPPSrri:		case X86::CMPPSrri:
case X86::VCMPPDrri:		case X86::VCMPPDrri:
case X86::VCMPPSrri:		case X86::VCMPPSrri:
case X86::VCMPPDYrri:		case X86::VCMPPDYrri:
case X86::VCMPPSYrri: {		case X86::VCMPPSYrri: {
// Float comparison can be safely commuted for		// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests		// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI->getOperand(3).getImm() & 0x7;		unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {		switch (Imm) {
case 0x00: // EQUAL		case 0x00: // EQUAL
case 0x03: // UNORDERED		case 0x03: // UNORDERED
case 0x04: // NOT EQUAL		case 0x04: // NOT EQUAL
case 0x07: // ORDERED		case 0x07: // ORDERED
if (NewMI) {		if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
default:		default:
return nullptr;		return nullptr;
}		}
}		}
case X86::VPCOMBri: case X86::VPCOMUBri:		case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:		case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:		case X86::VPCOMQri: case X86::VPCOMUQri:
case X86::VPCOMWri: case X86::VPCOMUWri: {		case X86::VPCOMWri: case X86::VPCOMUWri: {
Show All 12 Lines	default:
break;		break;
}		}
if (NewMI) {		if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->getOperand(3).setImm(Imm);		MI->getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstruction(MI, NewMI);		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:		case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:		case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:		case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:		case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:		case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:		case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:		case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->setDesc(get(Opc));		MI->setDesc(get(Opc));
// Fallthrough intended.		// Fallthrough intended.
}		}
default:		default:
return TargetInstrInfo::commuteInstruction(MI, NewMI);		if (isFMA3(MI->getOpcode())) {
		unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
		if (Opc == 0) {
		return nullptr;
		}
		if (NewMI) {
		MachineFunction &MF = *MI->getParent()->getParent();
		MI = MF.CloneMachineInstr(MI);
		NewMI = false;
		}
		MI->setDesc(get(Opc));
		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
		}
		return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
}		}
}		}

bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,		///
		/// Returns true if the given instruction opcode is FMA3.
		/// Otherwise, returns false.
		///
		bool X86InstrInfo::isFMA3(unsigned Opcode) const {
		switch (Opcode) {
		delenaUnsubmitted Not Done Reply Inline Actions Looks huge. I'm not sure, but maybe these enums are in alphabetical order and we can compare against the first and last? Or auto-generate something? delena: Looks huge. I'm not sure, but maybe these enums are in alphabetical order and we can compare against the first…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Unfortunately it is huge, I agree. Comparing against the first and last or having some assumptions about how and in which order the opcodes were defined seems a very risky approach causing unexpected effects/errors in the future. I am pretty sure that we should not go this way. I considered the idea of having a special bit for FMAs (something similar to the fields defined in llvm/include/llvm/Target/Target.td: isReturn, isBitcast, etc). Adding isFMA3 there would be inappropriate as FMA3 is meaningful only for X86, while all other 32 1-bit fields defined there are quite generic and usable for all targets. Also, adding even 1 bit there would increase the size of IR. Unfortunately, I could not find anything similar that is specific to the X86 platform. v_klochkov: Unfortunately it is huge, I agree. Comparing against the first and last or having some…
		case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
		case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
		case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
		case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
		case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
		case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
		case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
		case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:

		case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
		case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
		case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
		case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
		case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
		case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
		case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
		case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:

		case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
		case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
		case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
		case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
		case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
		case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
		case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
		case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:

		case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
		case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
		case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
		case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
		case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
		case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
		case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
		case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:

		case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
		case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
		case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
		case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
		case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
		case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
		case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
		case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
		case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
		case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
		case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
		case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
		case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
		case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
		case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
		case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:

		case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
		case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
		case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
		case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
		case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
		case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
		case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
		case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:

		case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
		case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
		case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
		case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
		case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
		case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
		case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
		case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
		case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
		case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
		case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
		case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
		case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
		case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
		case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
		case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:

		case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
		case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
		case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
		case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
		case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
		case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
		case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
		case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:

		case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
		case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
		case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
		case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
		case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
		case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
		case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
		case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
		case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
		case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
		case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
		case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
		case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
		case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
		case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
		case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
		return true;
		default:
		break;
		}
		return false;
		}

		///
		/// Returns true if the routine could find two commutable operands
		/// in the given FMA instruction. Otherwise, returns false.
		///
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
		/// The output indices of the commuted operands are returned in these
		/// arguments. Also, the input values of these arguments may be preset either
		/// to indices of operands that must be commuted or be equal to a special
		/// value (~0U) which means that the corresponding operand index is not set
		/// and this method is free to pick any of available commutable operands.
		///
		/// For example, calling this method this way:
		/// findFMA3CommutedOpIndices(MI, 1, ~0U);
		/// can be interpreted as a query asking if the operand #1 can be swapped
		/// with any other available operand (e.g. operand #2, operand #3, etc.).
		///
		/// Commuting the found operands may require an FMA opcode that differs
		/// from the opcode in the given MI. For example, commuting the operands
		/// #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results in an instruction with the adjusted opcode:
		/// FMA231 #3, #2, #1
		///
		bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
		unsigned &SrcOpIdx1,
		unsigned &SrcOpIdx2) const {

		unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;

		//
		// Only the first RegOpsNum operands are commutable.
		// Also, the value ~0U is valid here as it means that the operand is not
		// specified/fixed.
		//
		if (SrcOpIdx1 < 1 \|\| (SrcOpIdx1 > RegOpsNum && SrcOpIdx1 != ~0U) \|\|
		SrcOpIdx2 < 1 \|\| (SrcOpIdx2 > RegOpsNum && SrcOpIdx2 != ~0U)) {
		return false;
		}

		if (SrcOpIdx1 == ~0U \|\| SrcOpIdx2 == ~0U) {
		unsigned CommutableOpIdx1 = SrcOpIdx1;
		unsigned CommutableOpIdx2 = SrcOpIdx2;

		//
		// At least one of operands to be commuted is not specified and
		// this method is free to choose appropriate commutable operands.
		//
		if (SrcOpIdx1 == SrcOpIdx2) {
		// Neither of the operands is fixed. By default set one of commutable
		// operands to the last operand of the instruction.
		//
		CommutableOpIdx2 = RegOpsNum;
		}
		else if (SrcOpIdx2 == ~0U) {
		// Only one of operands is not fixed.
		//
		CommutableOpIdx2 = SrcOpIdx1;
		}

		// CommutableOpIdx2 is well defined now. Let's choose another commutable
		// operand and assign its index to CommutableOpIdx1.
		//
		unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
		for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
		// The commuted operands must have different registers.
		// Otherwise, the commute transformation does not change anything and
		// is useless then.
		//
		if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
		break;
		}

		// No appropriate commutable operands were found.
		//
		if (CommutableOpIdx1 == 0)
		return false;

		// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
		// to return those values.
		if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
		CommutableOpIdx1, CommutableOpIdx2))
		return false;
		}
		return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
		}

		///
		/// Returns an adjusted FMA opcode that must be used in FMA instruction that
		/// performs the same computations as the given MI but which has the operands
		/// SrcOpIdx1 and SrcOpIdx2 commuted.
		/// It may return 0 if it is unsafe to commute the operands.
		///
		/// The returned FMA opcode may differ from the opcode in the given MI.
		/// For example, commuting the operands #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results in an instruction with the adjusted opcode:
		/// FMA231 #3, #2, #1
		///
		unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
		delenaUnsubmitted Not Done Reply Inline Actions This method may be static. Right? delena: This method may be static. Right?
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Yes, this method could be static and be similar to existing methods like "static bool isFrameLoadOpcode(int Opcode)", etc. The reason why I passed 'MachineInstruction' argument instead of 'Opcode' to this function and why this method is not static now, is that I wanted to reserve the opportunity to handle SCALAR FMAs and their 1st operand more optimistically later (when additional analysis of scalar FMA users would be implemented); please see the FIXME comment at the line 3487. v_klochkov: Yes, this method could be static and be similar to existing methods like "static bool…
		unsigned SrcOpIdx1,
		unsigned SrcOpIdx2) const {
		int RetOpc = 0;
		int Opc = MI->getOpcode();

		//
		// Struct which describes FMA opcodes and dependencies between them.
		//
		static const struct {
		delenaUnsubmitted Not Done Reply Inline Actions I suggest to separate scalar from vector. You handle them separately, right? I also think that you can put all FMA tables in a separate header file. delena: I suggest to separate scalar from vector. You handle them separately, right? I also think that…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions There is one loop handling all vector and scalar FMAs below, I did not handle them separately. The 'IsScalar' field was needed only to handle the 1st operand with extra carefulness as commuting 1st operand of scalar FMA requires some additional analysis. Regarding the separating FMA tables into a separate header file... Separating it to a header file makes sense only when it would be used by something else, i.e. not only by one method. Otherwise, it is more convenient to have this array definition closer to the function/method using that table. Also, In my opinion the function local/static array OpcodeAlts is written using the same style that was used in several other places in this file (Please see the definition of MemoryFoldTable2Addr, MemoryFoldTable0, etc). Moving all similar static arrays of structures to a header file deserves a special/separate change-set. v_klochkov: There is one loop handling all vector and scalar FMAs below, I did not handle them separately.
		int Opc1;
		int Opc2;
		int Opc3;
		bool IsScalar;
		} OpcodeAlts[] = {
		{ X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r, true },
		{ X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r, true },
		{ X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r, false },
		{ X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r, false },
		{ X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY,false },
		{ X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY,false },
		{ X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m, true },
		{ X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m, true },
		{ X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m, false },
		{ X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m, false },
		{ X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY,false },
		{ X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY,false },

		{ X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r, true },
		{ X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r, true },
		{ X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r, false },
		{ X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r, false },
		{ X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY,false },
		{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY,false },
		{ X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m, true },
		{ X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m, true },
		{ X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m, false },
		{ X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m, false },
		{ X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY,false },
		{ X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY,false },

		{ X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r, true },
		{ X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r, true },
		{ X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r, false },
		{ X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r, false },
		{ X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY,false },
		{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY,false },
		{ X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m, true },
		{ X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m, true },
		{ X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m, false },
		{ X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m, false },
		{ X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY,false },
		{ X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY,false },

		{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r, true },
		{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r, true },
		{ X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r, false },
		{ X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r, false },
		{ X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY,false },
		{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY,false },
		{ X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m, true },
		{ X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m, true },
		{ X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m, false },
		{ X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m, false },
		{ X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY,false },
		{ X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY,false },

		{ X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r, false },
		{ X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r, false },
		{ X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY,false },
		{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY,false },
		{ X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m, false },
		{ X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m, false },
		{ X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY,false },
		{ X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY,false },

		{ X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r, false },
		{ X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r, false },
		{ X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY,false },
		{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY,false },
		{ X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m, false },
		{ X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m, false },
		{ X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY,false },
		{ X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY,false }
		};

		unsigned char OpcodeAltsNum = sizeof(OpcodeAlts) / sizeof(OpcodeAlts[0]);
		int i, pos = 0;
		for (i = 0; i < OpcodeAltsNum; i++) {
		if (OpcodeAlts[i].Opc2 == Opc) {
		pos = 2;
		break;
		}
		if (OpcodeAlts[i].Opc1 == Opc) {
		pos = 1;
		break;
		}
		if (OpcodeAlts[i].Opc3 == Opc) {
		pos = 3;
		break;
		}
		}

		//
		// Input opcode does not match with any from the table.
		//
		if (pos == 0)
		return 0;

		// FIXME: Commuting the 1st operand of scalar FMA requires some additional
		// analysis such as getting proof of the fact that all uses of the
		// given FMA instruction use only the lowest element. Without such a
		// proof, commuting the 1st operand of scalar FMAs changes the upper bits
		// of the result.
		//
		if (OpcodeAlts[i].IsScalar && (SrcOpIdx1 == 1 \|\| SrcOpIdx2 == 1))
		return 0;
		qcolombetUnsubmitted Not Done Reply Inline Actions My understanding is that you are addressing this point here: (2) Fixed a correctness problem caused by commuting 1st and 2nd operands of scalar FMAs generated for intrinsics. Most of the time I think we do not care about the high level bits of the value (which is what you are fixing here). Therefore, I wonder if we are not being pessimistic on the commutation opportunities. I agree we should seek correctness first, but I wonder how often that high level setting is actually expected… We had this bug forever and apparently nobody noticed it. Anyway, what is your plan to get us the performance back? qcolombet: My understanding is that you are addressing this point here: > (2) Fixed a correctness problem…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions That correctness problem exists for FMAs and does not exist for ADD/MUL operations. Also, FMAs are relatively new instructions. For example, if you compile the test: #include <immintrin.h> double func(double y, double x) { return y + x; } __m128d funcx(__m128d y, __m128d x) { return _mm_add_sd(x, y); } then you'll see that only 1 instruction is generated for func() and 2 instructions for funcx(). func() just ignores the upper bits of returned XMM and funcx() correctly handles the upper bits of returned XMM value. The difference in IR is: ADDSDrr opcode is used in func(), ADDSDrr_Int opcode is used in funcx(). So, one of possible solutions could be to add _Int opcodes for FMA operations like it was done for ADD and MUL operations, and be more conservative for FMA_Int opcodes only. Another solution is mentioned in FIXME comment above, i.e. to implement functionality that can tell if only the lowest element of the result of scalar FMA is used. In my opinion, these 2 solutions do not exclude each other; they both should be implemented. Currently, we do not have FMA_Int opcodes, that is why it would be better to be more conservative and correct. This patch might make the code a little bit worse/conservative on some corner cases, but it also improves code-gen for many other cases, for example, for those cases where the 1st or 2nd operand can be swapped with 3rd operand when it helps to do memory-op-folding optimization. v_klochkov:* That correctness problem exists for FMAs and does not exist for ADD/MUL operations. Also, FMAs…

		//
		// Find reversed FMA opcode.
		//
		if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 2) \|\|
		(SrcOpIdx1 == 2 && SrcOpIdx2 == 1)) {
		if (pos == 1)
		RetOpc = OpcodeAlts[i].Opc3;
		else if (pos == 2)
		RetOpc = Opc;
		else
		RetOpc = OpcodeAlts[i].Opc1;
		}
		else if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 3) \|\|
		(SrcOpIdx1 == 3 && SrcOpIdx2 == 1)) {
		if (pos == 1)
		RetOpc = Opc;
		else if (pos == 2)
		RetOpc = OpcodeAlts[i].Opc3;
		else
		RetOpc = OpcodeAlts[i].Opc2;
		}
		else if ((SrcOpIdx1 == 2 && SrcOpIdx2 == 3) \|\|
		(SrcOpIdx1 == 3 && SrcOpIdx2 == 2)) {
		if (pos == 1)
		RetOpc = OpcodeAlts[i].Opc2;
		else if (pos == 2)
		RetOpc = OpcodeAlts[i].Opc1;
		else
		RetOpc = Opc;
		}

		return RetOpc;
		}

		/// Returns true iff the routine could find two commutable operands in the
		/// given machine instruction.
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
		/// input values can be re-defined in this method only if the input values
		/// are not pre-defined, which is designated by the special value ~0U
		/// assigned to them.
		/// If both indices are pre-defined and refer to some operands, then the
		/// method simply returns true if the corresponding operands are commutable
		/// and returns false otherwise.
		///
		/// For example, calling this method this way:
		/// unsigned Op1 = 1, Op2 = ~0U;
		/// findCommutedOpIndices(MI, Op1, Op2);
		/// can be interpreted as a query asking to find an operand that would be
		/// commutable with the operand#1.
		///
		bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
		delenaUnsubmitted Not Done Reply Inline Actions I think this interface is inconvenient. I suggest to separate input and output. You can put ~0U as default value of input. delena: I think this interface is inconvenient. I suggest to separate input and output. You can put ~0U…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Special thank you for this comment! Separating INPUT and OUTPUT arguments seems very reasonable. I like this idea. In my opinion, both approaches have a right to exist, though. Before adding 2 additional arguments to findCommutedOpIndices() and fixing other places I would wait for more comments from reviewers. v_klochkov: Special thank you for this comment! Separating INPUT and OUTPUT arguments seems very reasonable.
		qcolombetUnsubmitted Not Done Reply Inline Actions I do not see why it is better to separate the input and output parameters here. As long as the parameter will have the value: CommuteAnyOperandIndex, we know how to make the distinction. qcolombet: I do not see why it is better to separate the input and output parameters here. As long as the…
		unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {		unsigned &SrcOpIdx2) const {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
case X86::CMPPDrri:		case X86::CMPPDrri:
case X86::CMPPSrri:		case X86::CMPPSrri:
case X86::VCMPPDrri:		case X86::VCMPPDrri:
case X86::VCMPPSrri:		case X86::VCMPPSrri:
case X86::VCMPPDYrri:		case X86::VCMPPDYrri:
case X86::VCMPPSYrri: {		case X86::VCMPPSYrri: {
// Float comparison can be safely commuted for		// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests		// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI->getOperand(3).getImm() & 0x7;		unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {		switch (Imm) {
case 0x00: // EQUAL		case 0x00: // EQUAL
case 0x03: // UNORDERED		case 0x03: // UNORDERED
case 0x04: // NOT EQUAL		case 0x04: // NOT EQUAL
case 0x07: // ORDERED		case 0x07: // ORDERED
SrcOpIdx1 = 1;		// The indices of the commutable operands are 1 and 2.
SrcOpIdx2 = 2;		// Assign them to the returned operand indices here.
return true;		return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}		}
return false;		return false;
}		}
case X86::VFMADDPDr231r:
case X86::VFMADDPSr231r:
case X86::VFMADDSDr231r:
case X86::VFMADDSSr231r:
case X86::VFMSUBPDr231r:
case X86::VFMSUBPSr231r:
case X86::VFMSUBSDr231r:
case X86::VFMSUBSSr231r:
case X86::VFNMADDPDr231r:
case X86::VFNMADDPSr231r:
case X86::VFNMADDSDr231r:
case X86::VFNMADDSSr231r:
case X86::VFNMSUBPDr231r:
case X86::VFNMSUBPSr231r:
case X86::VFNMSUBSDr231r:
case X86::VFNMSUBSSr231r:
case X86::VFMADDPDr231rY:
case X86::VFMADDPSr231rY:
case X86::VFMSUBPDr231rY:
case X86::VFMSUBPSr231rY:
case X86::VFNMADDPDr231rY:
case X86::VFNMADDPSr231rY:
case X86::VFNMSUBPDr231rY:
case X86::VFNMSUBPSr231rY:
SrcOpIdx1 = 2;
SrcOpIdx2 = 3;
return true;
default:		default:
		if (isFMA3(MI->getOpcode())) {
		return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
		}
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);		return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}		}

		return false;
}		}

static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {		static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {		switch (BrOpc) {
default: return X86::COND_INVALID;		default: return X86::COND_INVALID;
case X86::JE_1: return X86::COND_E;		case X86::JE_1: return X86::COND_E;
case X86::JNE_1: return X86::COND_NE;		case X86::JNE_1: return X86::COND_NE;
case X86::JL_1: return X86::COND_L;		case X86::JL_1: return X86::COND_L;
▲ Show 20 Lines • Show All 1,766 Lines • ▼ Show 20 Lines	if (I != OpcodeTablePtr->end()) {
}		}
return NewMI;		return NewMI;
}		}
}		}

// If the instruction and target operand are commutable, commute the		// If the instruction and target operand are commutable, commute the
// instruction and try again.		// instruction and try again.
if (AllowCommute) {		if (AllowCommute) {
unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;		unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = ~0U;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {		if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI->getDesc().getNumDefs();		bool HasDef = MI->getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;		unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();		unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();		unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
bool Tied0 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied1 =		bool Tied1 =
		0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
		bool Tied2 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);		0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);

// If either of the commutable operands are tied to the destination		// If either of the commutable operands are tied to the destination
// then we can not commute + fold.		// then we can not commute + fold.
if ((HasDef && Reg0 == Reg1 && Tied0) ||		if ((HasDef && Reg0 == Reg1 && Tied1) ||
(HasDef && Reg0 == Reg2 && Tied1))		(HasDef && Reg0 == Reg2 && Tied2))
return nullptr;		return nullptr;

if ((CommuteOpIdx1 == OriginalOpIdx) ||		MachineInstr *CommutedMI = commuteInstruction(MI, false,
(CommuteOpIdx2 == OriginalOpIdx)) {		CommuteOpIdx1,
MachineInstr *CommutedMI = commuteInstruction(MI, false);		CommuteOpIdx2);
if (!CommutedMI) {		if (!CommutedMI) {
// Unable to commute.		// Unable to commute.
return nullptr;		return nullptr;
}		}
if (CommutedMI != MI) {		if (CommutedMI != MI) {
// New instruction. We can't fold from this.		// New instruction. We can't fold from this.
CommutedMI->eraseFromParent();		CommutedMI->eraseFromParent();
return nullptr;		return nullptr;
}		}

// Attempt to fold with the commuted version of the instruction.		// Attempt to fold with the commuted version of the instruction.
unsigned CommuteOp =		NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
(CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);		Size, Align, /*AllowCommute=*/false);
NewMI =
foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align,
/*AllowCommute=*/false);
if (NewMI)		if (NewMI)
return NewMI;		return NewMI;

// Folding failed again - undo the commute before returning.		// Folding failed again - undo the commute before returning.
MachineInstr *UncommutedMI = commuteInstruction(MI, false);		MachineInstr *UncommutedMI = commuteInstruction(MI, false,
		CommuteOpIdx1,
		CommuteOpIdx2);
if (!UncommutedMI) {		if (!UncommutedMI) {
// Unable to commute.		// Unable to commute.
return nullptr;		return nullptr;
}		}
if (UncommutedMI != MI) {		if (UncommutedMI != MI) {
// New instruction. It doesn't need to be kept.		// New instruction. It doesn't need to be kept.
UncommutedMI->eraseFromParent();		UncommutedMI->eraseFromParent();
return nullptr;		return nullptr;
}		}

// Return here to prevent duplicate fuse failure report.		// Return here to prevent duplicate fuse failure report.
return nullptr;		return nullptr;
}		}
}		}
}

// No fusion		// No fusion
if (PrintFailedFusing && !MI->isCopy())		if (PrintFailedFusing && !MI->isCopy())
dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;		dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
return nullptr;		return nullptr;
}		}

/// Return true for all instructions that only update		/// Return true for all instructions that only update
▲ Show 20 Lines • Show All 1,673 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/fma-commute-x86.ll

				; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
				; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
				; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s

				declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd132ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd231ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd213ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd132ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd231ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd213ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd132pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd231pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd213pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd132pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd231pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd213pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}



				declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd132ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd231ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd213ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd132ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd231ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd213ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd132pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd231pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd213pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd132pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd231pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd213pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}


				declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub132ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub231ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub213ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub132ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub231ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub213ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub132pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub231pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub213pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub132pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub231pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub213pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}


				declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub132ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub231ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub213ps {{.*%r.*}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub132ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub231ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub213ps {{.*%r.*}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub132pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub231pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub213pd {{.*%r.*}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub132pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub231pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub213pd {{.*%r.*}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

llvm/test/CodeGen/X86/fma_patterns.ll

	Show First 20 Lines • Show All 128 Lines • ▼ Show 20 Lines
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {			define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
	%x = fmul <2 x double> %a0, %a1			%x = fmul <2 x double> %a0, %a1
	%res = fsub <2 x double> %x, %a2			%res = fsub <2 x double> %x, %a2
	ret <2 x double> %res			ret <2 x double> %res
	}			}

	; CHECK: test_x86_fnmadd_ss			; CHECK: test_x86_fnmadd_ss
	; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0			; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fnmadd_ss			; CHECK_FMA4: test_x86_fnmadd_ss
	; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0			; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {			define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
	%x = fmul float %a0, %a1			%x = fmul float %a0, %a1
	%res = fsub float %a2, %x			%res = fsub float %a2, %x
	ret float %res			ret float %res
	}			}

	; CHECK: test_x86_fnmadd_sd			; CHECK: test_x86_fnmadd_sd
	; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0			; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fnmadd_sd			; CHECK_FMA4: test_x86_fnmadd_sd
	; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0			; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {			define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
	%x = fmul double %a0, %a1			%x = fmul double %a0, %a1
	%res = fsub double %a2, %x			%res = fsub double %a2, %x
	ret double %res			ret double %res
	}			}

	; CHECK: test_x86_fmsub_sd			; CHECK: test_x86_fmsub_sd
	; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0			; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fmsub_sd			; CHECK_FMA4: test_x86_fmsub_sd
	; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0			; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {			define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
	%x = fmul double %a0, %a1			%x = fmul double %a0, %a1
	%res = fsub double %x, %a2			%res = fsub double %x, %a2
	ret double %res			ret double %res
	}			}

	; CHECK: test_x86_fnmsub_ss			; CHECK: test_x86_fnmsub_ss
	; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0			; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fnmsub_ss			; CHECK_FMA4: test_x86_fnmsub_ss
	; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0			; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {			define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
	%x = fsub float -0.000000e+00, %a0			%x = fsub float -0.000000e+00, %a0
	%y = fmul float %x, %a1			%y = fmul float %x, %a1
	%res = fsub float %y, %a2			%res = fsub float %y, %a2
	ret float %res			ret float %res
	}			}

	; CHECK: test_x86_fmadd_ps_load			; CHECK: test_x86_fmadd_ps_load
	; CHECK: vmovaps (%rdi), %xmm2			; CHECK: vfmadd132ps (%rdi), %xmm1, %xmm0
	; CHECK: vfmadd213ps %xmm1, %xmm2, %xmm0
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fmadd_ps_load			; CHECK_FMA4: test_x86_fmadd_ps_load
	; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0			; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
	%x = load <4 x float>, <4 x float>* %a0			%x = load <4 x float>, <4 x float>* %a0
	%y = fmul <4 x float> %x, %a1			%y = fmul <4 x float> %x, %a1
	%res = fadd <4 x float> %y, %a2			%res = fadd <4 x float> %y, %a2
	ret <4 x float> %res			ret <4 x float> %res
	}			}

	; CHECK: test_x86_fmsub_ps_load			; CHECK: test_x86_fmsub_ps_load
	; CHECK: vmovaps (%rdi), %xmm2			; CHECK: vfmsub132ps (%rdi), %xmm1, %xmm0
	; CHECK: fmsub213ps %xmm1, %xmm2, %xmm0
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fmsub_ps_load			; CHECK_FMA4: test_x86_fmsub_ps_load
	; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0			; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
	%x = load <4 x float>, <4 x float>* %a0			%x = load <4 x float>, <4 x float>* %a0
	%y = fmul <4 x float> %x, %a1			%y = fmul <4 x float> %x, %a1
	%res = fsub <4 x float> %y, %a2			%res = fsub <4 x float> %y, %a2
	ret <4 x float> %res			ret <4 x float> %res
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

Improved the interface of methods commuting operands, improved X86-FMA3 mem-folding&coalescing.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 30193

llvm/include/llvm/Target/TargetInstrInfo.h

llvm/lib/CodeGen/RegisterCoalescer.cpp

llvm/lib/CodeGen/TargetInstrInfo.cpp

llvm/lib/CodeGen/TwoAddressInstructionPass.cpp

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

llvm/lib/Target/ARM/ARMBaseInstrInfo.h

llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

llvm/lib/Target/ARM/Thumb2SizeReduction.cpp

llvm/lib/Target/PowerPC/PPCInstrInfo.h

llvm/lib/Target/PowerPC/PPCInstrInfo.cpp

llvm/lib/Target/X86/X86InstrFMA.td

llvm/lib/Target/X86/X86InstrInfo.h

llvm/lib/Target/X86/X86InstrInfo.cpp

llvm/test/CodeGen/X86/fma-commute-x86.ll

llvm/test/CodeGen/X86/fma_patterns.ll

Improved the interface of methods commuting operands, improved X86-FMA3 mem-folding&coalescing.
ClosedPublic