Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -670,6 +670,9 @@
   bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
 
   void finalizeLowering(MachineFunction &MF) const override;
+
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
 };
 
 namespace AArch64 {
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11422,3 +11422,13 @@
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+void
+AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                     SDNode *Node) const {
+  assert(MI.getOpcode() == AArch64::EXTRACT_SUBREG_BYTE);
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(TargetOpcode::COPY);
+  MI.setDesc(MCID);
+  MI.getOperand(1).setSubReg(AArch64::bsub);
+}
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -2369,6 +2369,19 @@
                      GPR64sp:$Rn, IndexType:$offset)>;
 }
 
+// Lowering byte stores is a bit unusual: we can't use EXTRACT_SUBREG because
+// it can't figure out the right register class.
+// FIXME: Instead of a target-specific node, make a target-independent
+// EXTRACT_SUBREG_TO_REGCLASS.
+let hasNoSchedulingInfo = 1, hasPostISelHook = 1 in
+def EXTRACT_SUBREG_BYTE : Pseudo<(outs FPR8:$Rd), (ins VecListOne128:$Rn), []>;
+let AddedComplexity = 19 in {
+  def : Pat<(truncstorei8 (i32 (vector_extract (v16i8 VecListOne128:$Vt), 0)),
+                          (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+            (STRBui (EXTRACT_SUBREG_BYTE VecListOne128:$Vt),
+                    GPR64sp:$Rn, uimm12s1:$offset)>;
+}
+
 let AddedComplexity = 19 in {
   defm : VecStoreLane0Pat;
   defm : VecStoreLane0Pat;
Index: test/CodeGen/AArch64/arm64-collect-loh.ll
===================================================================
--- test/CodeGen/AArch64/arm64-collect-loh.ll
+++ test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -614,11 +614,10 @@
 ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
 ; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
 ; CHECK-NEXT: ; kill
-; Ultimately we should generate str b0, but right now, we match the vector
-; variant which does not allow to fold the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str b0, {{\[}}[[LDRGOT_REG]]]
 ; CHECK-NEXT: ret
-; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
 define void @setL(<1 x i8> %t) {
   store <1 x i8> %t, <1 x i8>* @L, align 4
   ret void
Index: test/CodeGen/AArch64/arm64-st1.ll
===================================================================
--- test/CodeGen/AArch64/arm64-st1.ll
+++ test/CodeGen/AArch64/arm64-st1.ll
@@ -12,7 +12,7 @@
 
 define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane0_16b
-; CHECK: st1.b
+; CHECK: str
   %tmp = extractelement <16 x i8> %A, i32 0
   store i8 %tmp, i8* %D
   ret void
@@ -40,7 +40,7 @@
 define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane0_ro_16b
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; CHECK: str b0, [x[[XREG]]]
   %ptr = getelementptr i8, i8* %D, i64 %offset
   %tmp = extractelement <16 x i8> %A, i32 0
   store i8 %tmp, i8* %ptr
@@ -288,7 +288,7 @@
 define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane0_ro_8b
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; CHECK: str b0, [x[[XREG]]]
   %ptr = getelementptr i8, i8* %D, i64 %offset
   %tmp = extractelement <8 x i8> %A, i32 0
   store i8 %tmp, i8* %ptr