Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -670,6 +670,9 @@
   bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
 
   void finalizeLowering(MachineFunction &MF) const override;
+
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
 };
 
 namespace AArch64 {
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11422,3 +11422,13 @@
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+void
+AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                     SDNode *Node) const {
+  assert(MI.getOpcode() == AArch64::EXTRACT_SUBREG_BYTE);
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(TargetOpcode::COPY);
+  MI.setDesc(MCID);
+  MI.getOperand(1).setSubReg(AArch64::bsub);
+}
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -2369,6 +2369,19 @@
                      GPR64sp:$Rn, IndexType:$offset)>;
 }
 
+// Lowering byte stores is a bit unusual: we can't use EXTRACT_SUBREG because
+// it can't figure out the right register class.
+// FIXME: Instead of a target-specific node, make a target-independent
+// EXTRACT_SUBREG_TO_REGCLASS.
+let hasNoSchedulingInfo = 1, hasPostISelHook = 1 in
+def EXTRACT_SUBREG_BYTE : Pseudo<(outs FPR8:$Rd), (ins VecListOne128:$Rn), []>;
+let AddedComplexity = 19 in {
+  def : Pat<(truncstorei8 (i32 (vector_extract (v16i8 VecListOne128:$Vt), 0)),
+                          (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+            (STRBui (EXTRACT_SUBREG_BYTE VecListOne128:$Vt),
+                    GPR64sp:$Rn, uimm12s1:$offset)>;
+}
+
 let AddedComplexity = 19 in {
   defm : VecStoreLane0Pat;
   defm : VecStoreLane0Pat;
Index: test/CodeGen/AArch64/arm64-collect-loh.ll
===================================================================
--- test/CodeGen/AArch64/arm64-collect-loh.ll
+++ test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -614,11 +614,10 @@
 ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
 ; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
 ; CHECK-NEXT: ; kill
-; Ultimately we should generate str b0, but right now, we match the vector
-; variant which does not allow to fold the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str b0, {{\[}}[[LDRGOT_REG]]]
 ; CHECK-NEXT: ret
-; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
 define void @setL(<1 x i8> %t) {
   store <1 x i8> %t, <1 x i8>* @L, align 4
   ret void
Index: test/CodeGen/AArch64/arm64-st1.ll
===================================================================
--- test/CodeGen/AArch64/arm64-st1.ll
+++ test/CodeGen/AArch64/arm64-st1.ll
@@ -12,7 +12,7 @@
 
 define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane0_16b
-; CHECK: st1.b
+; CHECK: str
   %tmp = extractelement <16 x i8> %A, i32 0
   store i8 %tmp, i8* %D
   ret void
@@ -40,7 +40,7 @@
 define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane0_ro_16b
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; CHECK: str b0, [x[[XREG]]]
   %ptr = getelementptr i8, i8* %D, i64 %offset
   %tmp = extractelement <16 x i8> %A, i32 0
   store i8 %tmp, i8* %ptr
@@ -288,7 +288,7 @@
 define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane0_ro_8b
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; CHECK: str b0, [x[[XREG]]]
   %ptr = getelementptr i8, i8* %D, i64 %offset
   %tmp = extractelement <8 x i8> %A, i32 0
   store i8 %tmp, i8* %ptr