Index: lib/Target/ARM/ARMScheduleSwift.td
===================================================================
--- lib/Target/ARM/ARMScheduleSwift.td
+++ lib/Target/ARM/ARMScheduleSwift.td
@@ -37,1050 +37,12 @@
 // FIXME: Add preload instruction when it is documented.
 // FIXME: Model non-pipelined nature of FP div / sqrt unit.
 
-def SwiftItineraries : ProcessorItineraries<
-  [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
-  // ... roughly 1,000 removed lines of per-class InstrItinData entries
-  // (integer ALU, multiply, divide, load/store, FP, and NEON itineraries) ...
-]>;
-
-// ===---------------------------------------------------------------------===//
-// This following definitions describe the simple machine model which
-// will replace itineraries.
-
 // Swift machine model for scheduling and other instruction cost heuristics.
 def SwiftModel : SchedMachineModel {
   let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
   let MicroOpBufferSize = 45; // Based on NEON renamed registers.
   let LoadLatency = 3;
   let MispredictPenalty = 14; // A branch direction mispredict.
-
-  let Itineraries = SwiftItineraries;
 }
 
 // Swift predicates.
Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -429,6 +429,9 @@
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
+  /// Returns true if machine scheduler should be enabled.
+  bool enableMachineScheduler() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -332,12 +332,26 @@
   return SchedModel.MispredictPenalty;
 }
 
+static bool isOutOfOrder(const MCSchedModel &Model) {
+  return Model.MicroOpBufferSize > 1;
+}
+
 bool ARMSubtarget::hasSinCos() const {
   return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
 }
 
+bool ARMSubtarget::enableMachineScheduler() const {
+  // Enable the MachineScheduler before register allocation. This should be
+  // good for all out-of-order CPUs but I restricted it to swift for now.
+  return isOutOfOrder(getSchedModel()) && isSwift();
+}
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
+  // Do not use PostRAListSched on out of order CPUs. This should be good for
+  // all out-of-order CPUs but I restricted it to swift for now.
+  if (isOutOfOrder(getSchedModel()) && isSwift())
+    return false;
   return (!isThumb() || hasThumb2());
 }
Index: test/CodeGen/ARM/adv-copy-opt.ll
===================================================================
--- test/CodeGen/ARM/adv-copy-opt.ll
+++ test/CodeGen/ARM/adv-copy-opt.ll
@@ -11,25 +11,25 @@
 ; r0 = r0 / r2
 ; r1 = r1 / r3
 ;
-; NOOPT: vmov [[B:d[0-9]+]], r2, r3
-; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
+; NOOPT: vmov [[A:d[0-9]+]], r0, r1
+; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
 ; Move the low part of B into a register.
 ; Unfortunately, we cannot express that the 's' register is the low
 ; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
 ; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
 ; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
 ; NOOPT-NEXT: vmov r0, r1, [[RES]]
 ; NOOPT-NEXT: bx lr
 ;
 ; OPT-NOT: vmov
-; OPT: udiv r0, r0, r2
-; OPT-NEXT: udiv r1, r1, r3
+; OPT: udiv r1, r1, r3
+; OPT-NEXT: udiv r0, r0, r2
 ; OPT-NEXT: bx lr
 define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
 entry:
Index: test/CodeGen/ARM/avoid-cpsr-rmw.ll
===================================================================
--- test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
@@ -7,8 +7,10 @@
 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
-; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
+; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
 ; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
   %0 = mul nsw i32 %a, %b
   %1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@
 define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
@@ -55,8 +56,7 @@
 define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
 entry:
 ; CHECK-LABEL: t3:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
Index: test/CodeGen/ARM/cmpxchg-idioms.ll
===================================================================
--- test/CodeGen/ARM/cmpxchg-idioms.ll
+++ test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -15,14 +15,14 @@
 ; CHECK: bne [[LOOP]]
 
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #1
 ; CHECK: dmb ish
+; CHECK: movs r0, #1
 ; CHECK: bx lr
 
 ; CHECK: [[FAILED]]:
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #0
 ; CHECK: dmb ish
+; CHECK: movs r0, #0
 ; CHECK: bx lr
 
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@
 
 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
 ; CHECK-LABEL: test_return_bool:
-; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 ; CHECK: dmb ishst
+; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 
 ; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
Index: test/CodeGen/ARM/test-sharedidx.ll
===================================================================
--- test/CodeGen/ARM/test-sharedidx.ll
+++ test/CodeGen/ARM/test-sharedidx.ll
@@ -20,8 +20,8 @@
 
 for.body:                                         ; preds = %entry, %for.body.3
 ; CHECK: %for.body
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
   %0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@
 
 for.body.1:                                       ; preds = %for.body
 ; CHECK: %for.body.1
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
   %2 = load i8, i8* %arrayidx.1, align 1
   %conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@
 
 for.body.2:                                       ; preds = %for.body.1
 ; CHECK: %for.body.2
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
   %4 = load i8, i8* %arrayidx.2, align 1
   %conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@
 
 for.body.3:                                       ; preds = %for.body.2
 ; CHECK: %for.body.3
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
   %6 = load i8, i8* %arrayidx.3, align 1
   %conv6.3 = zext i8 %6 to i32
Index: test/CodeGen/ARM/vector-load.ll
===================================================================
--- test/CodeGen/ARM/vector-load.ll
+++ test/CodeGen/ARM/vector-load.ll
@@ -238,12 +238,12 @@
 
 define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
 ;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
-;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
 ;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
-;CHECK: str.w r[[INCREG]], [r0]
 ;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
 ;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+;CHECK: str r[[INCREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %lA = load <4 x i8>, <4 x i8>* %A, align 4
   %inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
Index: test/CodeGen/ARM/vector-store.ll
===================================================================
--- test/CodeGen/ARM/vector-store.ll
+++ test/CodeGen/ARM/vector-store.ll
@@ -228,9 +228,9 @@
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
-;CHECK: movs [[IMM16:r[0-9]+]], #16
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
+;CHECK: movs [[IMM16:r[0-9]+]], #16
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
 ;CHECK: str r[[PTRREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr