diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -636,3 +636,109 @@
   %5 = or i32 %4, %2
   ret i32 %5
 }
+
+; For or operation, one operand is a left shift of another operand.
+; Use orr with left-shifted operand is better than bfi.
+define i64 @test_orr_not_bfi_i64(i64 %0) {
+; CHECK-LABEL: test_orr_not_bfi_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xff
+; CHECK-NEXT:    bfi x8, x0, #8, #8
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
+  %2 = and i64 %0, 255
+  %3 = shl i64 %2, 8
+  %4 = or i64 %2, %3
+  ret i64 %4
+}
+
+; bfi is better than orr, since it would simplify away two instructions
+; (%mask and %bit-field-pos-op).
+define i32 @test_bfi_not_orr_i32(i32 %0, i32 %1) {
+; CHECK-LABEL: test_bfi_not_orr_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w1, #0xff
+; CHECK-NEXT:    bfi w8, w0, #8, #8
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+  %bfi_dst = and i32 %1, 255
+  %mask = and i32 %0, 255
+  %bit-field-pos-op = shl i32 %mask, 8
+  %or_res = or i32 %bit-field-pos-op, %bfi_dst
+  ret i32 %or_res
+}
+
+; orr is better than bfi, since both simplify away one instruction (%3)
+; while orr has shorter latency and higher throughput.
+define i32 @test_orr_not_bfi_i32(i32 %0) {
+; CHECK-LABEL: test_orr_not_bfi_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    bfi w8, w0, #8, #8
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+  %2 = and i32 %0, 255
+  %3 = shl i32 %2, 8
+  %4 = or i32 %2, %3
+  ret i32 %4
+}
+
+; bfxil is better than orr, since it would simplify away two instructions
+; (%mask and %bit-field-extract-op).
+define i64 @test_bfxil_not_orr_i64(i64 %0, i64 %1) {
+; CHECK-LABEL: test_bfxil_not_orr_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xff000
+; CHECK-NEXT:    bfxil x0, x1, #12, #8
+; CHECK-NEXT:    ret
+  %shifted-mask = and i64 %1, 1044480
+  %bfi-dst = and i64 %0, 1044480
+  %bit-field-extract-op = lshr i64 %shifted-mask, 12
+  %or_res = or i64 %bit-field-extract-op, %bfi-dst
+  ret i64 %or_res
+}
+
+; orr is better than bfxil, since one operand is the right shift of  another
+; operand.
+define i64 @orr_not_bfxil_test2_i64(i64 %0) {
+; CHECK-LABEL: orr_not_bfxil_test2_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xff000
+; CHECK-NEXT:    bfxil x8, x0, #12, #8
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
+  %2 = and i64 %0, 1044480 ; 0xff000
+  %3 = lshr i64 %2, 12
+  %4 = or i64 %2, %3
+  ret i64 %4
+}
+
+; bfxil simplifies away two instructions (that computes %shifted-mask and
+; %bit-field-extract-op respectively), so it's better than orr (which
+; simplifies away at most one shift).
+define i32 @test_bfxil_not_orr_i32(i32 %0, i32 %1) {
+; CHECK-LABEL: test_bfxil_not_orr_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w0, w0, #0xff000
+; CHECK-NEXT:    bfxil w0, w1, #12, #8
+; CHECK-NEXT:    ret
+  %shifted-mask = and i32 %1, 1044480
+  %bfxil-dst = and i32 %0, 1044480
+  %bit-field-extract-op = lshr i32 %shifted-mask, 12
+  %or_res = or i32 %bit-field-extract-op, %bfxil-dst
+  ret i32 %or_res
+}
+
+; one operand is the shift of another operand, so orr is better.
+define i32 @orr_not_bfxil_test2_i32(i32 %0) {
+; CHECK-LABEL: orr_not_bfxil_test2_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff000
+; CHECK-NEXT:    bfxil w8, w0, #12, #8
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+  %2 = and i32 %0, 1044480  ; 0xff000
+  %3 = lshr i32 %2, 12
+  %4 = or i32 %2, %3
+  ret i32 %4
+}