/* SPDX-License-Identifier: MIT */
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
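
/* Note: G_l/G_h/H_l/H_h reuse count, dst, src and srcend, and tmp1
   aliases E_l (x14).  Each alias is only written once the original
   value is dead, so no live state is clobbered.  */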

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
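
/* For reference, the entry dispatch matches this C sketch (illustrative
   only, not part of the build; copy_small, copy32_128 and copy_long are
   hypothetical names for the labels below):

	void *memcpy (void *dstin, const void *src, size_t count)
	{
	    if (count > 128)
	        return copy_long (dstin, src, count);	// overlap-checked
	    if (count > 32)
	        return copy32_128 (dstin, src, count);
	    return copy_small (dstin, src, count);	// 0..32 bytes
	}
*/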

ENTRY_ALIAS (memmove)
ENTRY (memcpy)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

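/* The 0..32 byte paths below share one idea: copy a block from the start
   and an equally sized block from the end, letting the two overlap in the
   middle instead of branching on the exact size.  A self-contained C
   sketch of the 16..32 byte case (illustrative only, not part of the
   build; copy16_32 is a hypothetical name):

	#include <string.h>
	#include <stddef.h>

	// Assumes 16 <= count <= 32; the two 16-byte blocks may overlap.
	static void copy16_32 (char *dst, const char *src, size_t count)
	{
	    char head[16], tail[16];
	    memcpy (head, src, 16);               // bytes [0, 16)
	    memcpy (tail, src + count - 16, 16);  // bytes [count-16, count)
	    memcpy (dst, head, 16);               // loads done before stores,
	    memcpy (dst + count - 16, tail, 16);  // so overlap is harmless
	}
*/
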
	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
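	/* With 1 <= count <= 3, offsets 0, count/2 and count - 1 cover
	   every byte: count == 1 writes offset 0 three times, count == 2
	   writes offsets 0, 1, 1, and count == 3 writes offsets 0, 1, 2.  */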
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
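	/* Same head/tail trick with 32-byte blocks: for 33..64 bytes, A/B
	   cover the first 32 and C/D the last 32, overlapping in the
	   middle; larger sizes continue to L(copy128).  */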
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
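	/* A/B/E/F cover bytes [0, 64) and C/D the last 32; above 96 bytes,
	   G/H also cover the 32 bytes preceding the last 32.  */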
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
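	/* The unsigned compare "dstin - src < count" is only true when dst
	   points into the source range, i.e. a forward copy would overwrite
	   source bytes before they are read; dst below src wraps to a huge
	   unsigned value and keeps the forward path, which is safe.  */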
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
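	/* src is moved back by the same offset as dst so the pair stays in
	   lockstep; the unaligned 16 bytes stored at dstin overlap the
	   first aligned store, harmlessly rewriting up to 15 bytes.  */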

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

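	/* Software pipelined: each iteration stores the four pairs loaded
	   by the previous iteration while issuing the next four loads, so
	   load latency is hidden behind the stores.  */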
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
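	/* The tail is not sized precisely: the final 64 bytes are copied
	   relative to srcend/dstend and may overlap bytes already stored,
	   which is cheaper than computing an exact remainder.  */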
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
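	/* Mirror image of the forwards path: dstend is aligned down and the
	   loop walks from high to low addresses, so overlapping source
	   bytes are always read before they are overwritten.  */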
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
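	/* As in the forwards tail, the first 64 bytes are copied at fixed
	   offsets from src/dstin; each pair is loaded before the store that
	   could overwrite it, keeping the interleaving overlap-safe.  */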
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (memcpy)