Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1822,15 +1822,46 @@
 
       // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
       if (mi->isInsertSubreg()) {
-        // From %reg = INSERT_SUBREG %reg, %subreg, subidx
-        // To   %reg:subidx = COPY %subreg
-        unsigned SubIdx = mi->getOperand(3).getImm();
-        mi->removeOperand(3);
-        assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
-        mi->getOperand(0).setSubReg(SubIdx);
-        mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
-        mi->removeOperand(1);
-        mi->setDesc(TII->get(TargetOpcode::COPY));
+        Register DstReg = mi->getOperand(0).getReg();
+        const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
+        MachineInstr *SrcMI = MRI->getUniqueVRegDef(mi->getOperand(2).getReg());
+
+        // The 32-bit form of an instruction ignores the upper 32 bits of its
+        // source registers and zeroes the upper 32 bits of its destination, so
+        // a zero-extend of a value defined by such an instruction is redundant.
+        //
+        // Keep the plain COPY rewrite when the inserted value lacks a unique
+        // def, or when that def is a COPY or a target-independent opcode. The
+        // isTypeLegalForClass checks restrict the SUBREG_TO_REG rewrite to
+        // classes where i64 is legal and f64 is not, limiting test churn.
+        // NOTE(review): operand 1 and subidx are unchecked here -- confirm.
+        if (!SrcMI || (SrcMI->getOpcode() == TargetOpcode::COPY) ||
+            (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
+            !TRI->isTypeLegalForClass(*RC, MVT::i64) ||
+            TRI->isTypeLegalForClass(*RC, MVT::f64)) {
+          // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+          // To   %reg:subidx = COPY %subreg
+          unsigned SubIdx = mi->getOperand(3).getImm();
+          mi->removeOperand(3);
+          assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+          mi->getOperand(0).setSubReg(SubIdx);
+          mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
+          mi->removeOperand(1);
+          mi->setDesc(TII->get(TargetOpcode::COPY));
+        } else {
+          // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+          // To   %reg = SUBREG_TO_REG 0, %subreg, subidx
+          // Build the SUBREG_TO_REG first, then erase the INSERT_SUBREG.
+          MachineInstr *SubregMI =
+              BuildMI(*mi->getParent(), *mi, mi->getDebugLoc(),
+                      TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
+                  .addImm(0)
+                  .add(mi->getOperand(2))
+                  .add(mi->getOperand(3));
+          mi->eraseFromParent();
+          mi = SubregMI;
+        }
+
         LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
 
         // Update LiveIntervals.
Index: llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
===================================================================
--- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -528,9 +528,9 @@
 ; NOLSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
 ; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
 ; NOLSE-NEXT:    mov w8, w9
-; NOLSE-NEXT:    mvn w10, w8
-; NOLSE-NEXT:    // implicit-def: $x8
-; NOLSE-NEXT:    mov w8, w10
+; NOLSE-NEXT:    mvn w8, w8
+; NOLSE-NEXT:    // implicit-def: $x10
+; NOLSE-NEXT:    // kill: def $x8 killed $w8
 ; NOLSE-NEXT:    orr x12, x8, #0xfffffffffffffffe
 ; NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
 ; NOLSE-NEXT:    // Parent Loop BB8_1 Depth=1
@@ -569,9 +569,9 @@
 ; LSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
 ; LSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
 ; LSE-NEXT:    mov w8, w9
-; LSE-NEXT:    mvn w10, w8
-; LSE-NEXT:    // implicit-def: $x8
-; LSE-NEXT:    mov w8, w10
+; LSE-NEXT:    mvn w8, w8
+; LSE-NEXT:    // implicit-def: $x10
+; LSE-NEXT:    // kill: def $x8 killed $w8
 ; LSE-NEXT:    orr x10, x8, #0xfffffffffffffffe
 ; LSE-NEXT:    mov x8, x9
 ; LSE-NEXT:    casal x8, x10, [x11]
@@ -609,9 +609,9 @@
 ; NOLSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
 ; NOLSE-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
 ; NOLSE-NEXT:    mov w9, w8
-; NOLSE-NEXT:    mvn w10, w9
-; NOLSE-NEXT:    // implicit-def: $x9
-; NOLSE-NEXT:    mov w9, w10
+; NOLSE-NEXT:    mvn w9, w9
+; NOLSE-NEXT:    // implicit-def: $x10
+; NOLSE-NEXT:    // kill: def $x9 killed $w9
 ; NOLSE-NEXT:    orr x14, x9, #0xfffffffffffffffe
 ; NOLSE-NEXT:    mov x15, #-1
 ; NOLSE-NEXT:  .LBB9_2: // %atomicrmw.start
@@ -667,9 +667,9 @@
 ; LSE-NEXT:    mov x0, x8
 ; LSE-NEXT:    mov x1, x10
 ; LSE-NEXT:    mov w11, w8
-; LSE-NEXT:    mvn w12, w11
-; LSE-NEXT:    // implicit-def: $x11
-; LSE-NEXT:    mov w11, w12
+; LSE-NEXT:    mvn w11, w11
+; LSE-NEXT:    // implicit-def: $x12
+; LSE-NEXT:    // kill: def $x11 killed $w11
 ; LSE-NEXT:    orr x2, x11, #0xfffffffffffffffe
 ; LSE-NEXT:    mov x11, #-1
 ; LSE-NEXT:    // kill: def $x2 killed $x2 def $x2_x3
Index: llvm/test/CodeGen/X86/bitcast-setcc-128.ll
===================================================================
--- llvm/test/CodeGen/X86/bitcast-setcc-128.ll
+++ llvm/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -517,8 +517,8 @@
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
-; AVX2-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; AVX2-NEXT:    orq %rcx, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq