Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1822,15 +1822,46 @@ // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. if (mi->isInsertSubreg()) { - // From %reg = INSERT_SUBREG %reg, %subreg, subidx - // To %reg:subidx = COPY %subreg - unsigned SubIdx = mi->getOperand(3).getImm(); - mi->removeOperand(3); - assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); - mi->getOperand(0).setSubReg(SubIdx); - mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef()); - mi->removeOperand(1); - mi->setDesc(TII->get(TargetOpcode::COPY)); + Register DstReg = mi->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DstReg); + MachineInstr *SrcMI = MRI->getUniqueVRegDef(mi->getOperand(2).getReg()); + + // When you use the 32-bit form of an instruction, the upper 32 bits of + // the source registers are ignored and the upper 32 bits of the + // destination register are set to zero. + // + // If the target's 32-bit form of an instruction defines the source operand + // of the zero-extend, the zero-extend is not needed. Check that the + // defining MI's opcode is a real instruction; if it is not, conservatively + // fall back to the plain COPY rewrite. + // The isTypeLegalForClass checks avoid updating too many test cases. 
+ if (!SrcMI || (SrcMI->getOpcode() == TargetOpcode::COPY) || + (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) || + !TRI->isTypeLegalForClass(*RC, MVT::i64) || + TRI->isTypeLegalForClass(*RC, MVT::f64)) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->removeOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef()); + mi->removeOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + } else { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg = SUBREG_TO_REG 0, %subreg, subidx + // Build a SUBREG_TO_REG instruction + MachineInstr *SubregMI = + BuildMI(*mi->getParent(), *mi, mi->getDebugLoc(), + TII->get(TargetOpcode::SUBREG_TO_REG), DstReg) + .addImm(0) + .add(mi->getOperand(2)) + .add(mi->getOperand(3)); + mi->eraseFromParent(); + mi = SubregMI; + } + LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); // Update LiveIntervals. 
Index: llvm/test/CodeGen/AArch64/atomicrmw-O0.ll =================================================================== --- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -528,9 +528,9 @@ ; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload ; NOLSE-NEXT: mov w8, w9 -; NOLSE-NEXT: mvn w10, w8 -; NOLSE-NEXT: // implicit-def: $x8 -; NOLSE-NEXT: mov w8, w10 +; NOLSE-NEXT: mvn w8, w8 +; NOLSE-NEXT: // implicit-def: $x10 +; NOLSE-NEXT: // kill: def $x8 killed $w8 ; NOLSE-NEXT: orr x12, x8, #0xfffffffffffffffe ; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 @@ -569,9 +569,9 @@ ; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload ; LSE-NEXT: mov w8, w9 -; LSE-NEXT: mvn w10, w8 -; LSE-NEXT: // implicit-def: $x8 -; LSE-NEXT: mov w8, w10 +; LSE-NEXT: mvn w8, w8 +; LSE-NEXT: // implicit-def: $x10 +; LSE-NEXT: // kill: def $x8 killed $w8 ; LSE-NEXT: orr x10, x8, #0xfffffffffffffffe ; LSE-NEXT: mov x8, x9 ; LSE-NEXT: casal x8, x10, [x11] @@ -609,9 +609,9 @@ ; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload ; NOLSE-NEXT: mov w9, w8 -; NOLSE-NEXT: mvn w10, w9 -; NOLSE-NEXT: // implicit-def: $x9 -; NOLSE-NEXT: mov w9, w10 +; NOLSE-NEXT: mvn w9, w9 +; NOLSE-NEXT: // implicit-def: $x10 +; NOLSE-NEXT: // kill: def $x9 killed $w9 ; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe ; NOLSE-NEXT: mov x15, #-1 ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start @@ -667,9 +667,9 @@ ; LSE-NEXT: mov x0, x8 ; LSE-NEXT: mov x1, x10 ; LSE-NEXT: mov w11, w8 -; LSE-NEXT: mvn w12, w11 -; LSE-NEXT: // implicit-def: $x11 -; LSE-NEXT: mov w11, w12 +; LSE-NEXT: mvn w11, w11 +; LSE-NEXT: // implicit-def: $x12 +; LSE-NEXT: // kill: def $x11 killed $w11 ; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe ; LSE-NEXT: mov x11, #-1 ; LSE-NEXT: // kill: def $x2 killed $x2 def 
$x2_x3 Index: llvm/test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -517,8 +517,8 @@ ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq