arm: Use optimized memcpy and memset from linux
Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.
Here are some numbers for tests done with jadecpu
                           | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
                           |        | +patch |        | +patch |
---------------------------+--------+--------+--------+--------+
Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
                           |        |        |        |        |
TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
                           |        |        |        |        |
FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
                           |        |        |        |        |
BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
 where CRC is              |  615ms |  615ms |   54ms |   54ms |
 uncompress                | 2460ms | 2462ms |  450ms |  451ms |
 final boot_elf            |  376ms |   68ms |   65ms |   65ms |
                           |        |        |        |        |
BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
 where CRC is              |  600ms |  600ms |  135ms |  135ms |
 uncompress                | 2209ms | 2211ms |  828ms |  828ms |
                           |        |        |        |        |
Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on
Size impact:
C version:
   text    data     bss     dec     hex filename
 202862   18912  266456  488230   77326 u-boot
ASM version:
   text    data     bss     dec     hex filename
 203798   18912  266288  488998   77626 u-boot
222712  u-boot.bin
Signed-off-by: Matthias Weisser <weisserm@arcor.de>
			
			
This commit is contained in:
		
							parent
							
								
									b65a77a861
								
							
						
					
					
						commit
						d8834a1323
					
				
							
								
								
									
										6
									
								
								README
								
								
								
								
							
							
						
						
									
										6
									
								
								README
								
								
								
								
							|  | @ -2944,6 +2944,12 @@ Low Level (hardware related) configuration options: | ||||||
| 		that is executed before the actual U-Boot. E.g. when | 		that is executed before the actual U-Boot. E.g. when | ||||||
| 		compiling a NAND SPL. | 		compiling a NAND SPL. | ||||||
| 
 | 
 | ||||||
|  | - CONFIG_USE_ARCH_MEMCPY | ||||||
|  |   CONFIG_USE_ARCH_MEMSET | ||||||
|  | 		If these options are used an optimized version of memcpy/memset will | ||||||
|  | 		be used if available. These functions may be faster under some | ||||||
|  | 		conditions but may increase the binary size. | ||||||
|  | 
 | ||||||
| Building the Software: | Building the Software: | ||||||
| ====================== | ====================== | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -0,0 +1,60 @@ | ||||||
|  | /*
 | ||||||
|  |  *  arch/arm/include/asm/assembler.h | ||||||
|  |  * | ||||||
|  |  *  Copyright (C) 1996-2000 Russell King | ||||||
|  |  * | ||||||
|  |  * This program is free software; you can redistribute it and/or modify | ||||||
|  |  * it under the terms of the GNU General Public License version 2 as | ||||||
|  |  * published by the Free Software Foundation. | ||||||
|  |  * | ||||||
|  |  *  This file contains arm architecture specific defines | ||||||
|  |  *  for the different processors. | ||||||
|  |  * | ||||||
|  |  *  Do not include any C declarations in this file - it is included by | ||||||
|  |  *  assembler source. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Endian independent macros for shifting bytes within registers. | ||||||
|  |  */ | ||||||
|  | #ifndef __ARMEB__ | ||||||
|  | #define pull		lsr | ||||||
|  | #define push		lsl | ||||||
|  | #define get_byte_0	lsl #0 | ||||||
|  | #define get_byte_1	lsr #8 | ||||||
|  | #define get_byte_2	lsr #16 | ||||||
|  | #define get_byte_3	lsr #24 | ||||||
|  | #define put_byte_0	lsl #0 | ||||||
|  | #define put_byte_1	lsl #8 | ||||||
|  | #define put_byte_2	lsl #16 | ||||||
|  | #define put_byte_3	lsl #24 | ||||||
|  | #else | ||||||
|  | #define pull		lsl | ||||||
|  | #define push		lsr | ||||||
|  | #define get_byte_0	lsr #24 | ||||||
|  | #define get_byte_1	lsr #16 | ||||||
|  | #define get_byte_2	lsr #8 | ||||||
|  | #define get_byte_3      lsl #0 | ||||||
|  | #define put_byte_0	lsl #24 | ||||||
|  | #define put_byte_1	lsl #16 | ||||||
|  | #define put_byte_2	lsl #8 | ||||||
|  | #define put_byte_3      lsl #0 | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Data preload for architectures that support it | ||||||
|  |  */ | ||||||
|  | #if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ | ||||||
|  | 	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ | ||||||
|  | 	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ | ||||||
|  | 	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ | ||||||
|  | 	defined(__ARM_ARCH_7R__) | ||||||
|  | #define PLD(code...)	code | ||||||
|  | #else | ||||||
|  | #define PLD(code...) | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Cache aligned | ||||||
|  |  */ | ||||||
|  | #define CALGN(code...) code | ||||||
|  | @ -1,6 +1,8 @@ | ||||||
| #ifndef __ASM_ARM_STRING_H | #ifndef __ASM_ARM_STRING_H | ||||||
| #define __ASM_ARM_STRING_H | #define __ASM_ARM_STRING_H | ||||||
| 
 | 
 | ||||||
|  | #include <config.h> | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * We don't do inline string functions, since the |  * We don't do inline string functions, since the | ||||||
|  * optimised inline asm versions are not small. |  * optimised inline asm versions are not small. | ||||||
|  | @ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c); | ||||||
| #undef __HAVE_ARCH_STRCHR | #undef __HAVE_ARCH_STRCHR | ||||||
| extern char * strchr(const char * s, int c); | extern char * strchr(const char * s, int c); | ||||||
| 
 | 
 | ||||||
| #undef __HAVE_ARCH_MEMCPY | #ifdef CONFIG_USE_ARCH_MEMCPY | ||||||
|  | #define __HAVE_ARCH_MEMCPY | ||||||
|  | #endif | ||||||
| extern void * memcpy(void *, const void *, __kernel_size_t); | extern void * memcpy(void *, const void *, __kernel_size_t); | ||||||
| 
 | 
 | ||||||
| #undef __HAVE_ARCH_MEMMOVE | #undef __HAVE_ARCH_MEMMOVE | ||||||
|  | @ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t); | ||||||
| extern void * memchr(const void *, int, __kernel_size_t); | extern void * memchr(const void *, int, __kernel_size_t); | ||||||
| 
 | 
 | ||||||
| #undef __HAVE_ARCH_MEMZERO | #undef __HAVE_ARCH_MEMZERO | ||||||
| #undef __HAVE_ARCH_MEMSET | #ifdef CONFIG_USE_ARCH_MEMSET | ||||||
|  | #define __HAVE_ARCH_MEMSET | ||||||
|  | #endif | ||||||
| extern void * memset(void *, int, __kernel_size_t); | extern void * memset(void *, int, __kernel_size_t); | ||||||
| 
 | 
 | ||||||
| #if 0 | #if 0 | ||||||
|  |  | ||||||
|  | @ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o | ||||||
| endif | endif | ||||||
| COBJS-y	+= interrupts.o | COBJS-y	+= interrupts.o | ||||||
| COBJS-y	+= reset.o | COBJS-y	+= reset.o | ||||||
|  | SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o | ||||||
|  | SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o | ||||||
| 
 | 
 | ||||||
| SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 | SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 | ||||||
| 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) | 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) | ||||||
|  |  | ||||||
|  | @ -0,0 +1,241 @@ | ||||||
|  | /* | ||||||
|  |  *  linux/arch/arm/lib/memcpy.S | ||||||
|  |  * | ||||||
|  |  *  Author:	Nicolas Pitre | ||||||
|  |  *  Created:	Sep 28, 2005 | ||||||
|  |  *  Copyright:	MontaVista Software, Inc. | ||||||
|  |  * | ||||||
|  |  *  This program is free software; you can redistribute it and/or modify
 | ||||||
|  |  *  it under the terms of the GNU General Public License version 2 as | ||||||
|  |  *  published by the Free Software Foundation. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | #include <asm/assembler.h> | ||||||
|  | 
 | ||||||
/*
 * W() wraps a mnemonic; in this file it expands to the mnemonic unchanged.
 */
|  | #define W(instr)	instr | ||||||
|  | 
 | ||||||
/*
 * Shift counts applied to the byte offset in ip before the computed jumps
 * (add pc, pc, ip) further down.  With both set to 0 the offset is used
 * unscaled; ldr1w and str1w below each expand to a single instruction.
 */
|  | #define LDR1W_SHIFT	0 | ||||||
|  | #define STR1W_SHIFT	0 | ||||||
|  | 
 | ||||||
/*
 * Load/store helper macros.  All of them post-increment \ptr.  Each one
 * accepts an "abort" argument, but no macro body in this file references it.
 */
|  | 	.macro ldr1w ptr reg abort | ||||||
|  | 	W(ldr) \reg, [\ptr], #4 | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort | ||||||
|  | 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort | ||||||
|  | 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro ldr1b ptr reg cond=al abort | ||||||
|  | 	ldr\cond\()b \reg, [\ptr], #1 | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro str1w ptr reg abort | ||||||
|  | 	W(str) \reg, [\ptr], #4 | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort | ||||||
|  | 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro str1b ptr reg cond=al abort | ||||||
|  | 	str\cond\()b \reg, [\ptr], #1 | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
/*
 * enter pushes r0 plus two caller-chosen registers; exit pops the same
 * three slots.  memcpy uses "enter r4, lr" and "exit r4, pc", so exit both
 * reloads the r0 value seen at entry and returns by loading the saved lr
 * into pc.
 */
|  | 	.macro enter reg1 reg2 | ||||||
|  | 	stmdb sp!, {r0, \reg1, \reg2} | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.macro exit reg1 reg2 | ||||||
|  | 	ldmfd sp!, {r0, \reg1, \reg2} | ||||||
|  | 	.endm | ||||||
|  | 
 | ||||||
|  | 	.text | ||||||
|  | 
 | ||||||
|  | /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ | ||||||
/*
 * Register use in the code below: r0 is the store pointer, r1 the load
 * pointer and r2 the remaining byte count.  r2 is biased by the subs
 * instructions (4 at entry, 28 more at 1:), so it goes negative before the
 * copy is finished; the tail code only looks at its low bits.
 * The r0 value at entry is restored by exit, so that is what the caller
 * receives.
 */
|  | 
 | ||||||
|  | .globl memcpy
 | ||||||
|  | memcpy: | ||||||
|  | 
 | ||||||
|  | 		enter	r4, lr | ||||||
|  | 
 | ||||||
|  | 		subs	r2, r2, #4 | ||||||
|  | 		blt	8f | ||||||
|  | 		ands	ip, r0, #3 | ||||||
|  | 	PLD(	pld	[r1, #0]		) | ||||||
|  | 		bne	9f | ||||||
|  | 		ands	ip, r1, #3 | ||||||
|  | 		bne	10f | ||||||
|  | 
 | ||||||
/*
 * Both pointers are word aligned here.  The loop at 3:/4: moves eight
 * registers (32 bytes) per iteration.
 */
|  | 1:		subs	r2, r2, #(28) | ||||||
|  | 		stmfd	sp!, {r5 - r8} | ||||||
|  | 		blt	5f | ||||||
|  | 
 | ||||||
|  | 	CALGN(	ands	ip, r0, #31		) | ||||||
|  | 	CALGN(	rsb	r3, ip, #32		) | ||||||
|  | 	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
 | ||||||
|  | 	CALGN(	bcs	2f			) | ||||||
|  | 	CALGN(	adr	r4, 6f			) | ||||||
|  | 	CALGN(	subs	r2, r2, r3		)  @ C gets set
 | ||||||
|  | 	CALGN(	add	pc, r4, ip		) | ||||||
|  | 
 | ||||||
|  | 	PLD(	pld	[r1, #0]		) | ||||||
|  | 2:	PLD(	subs	r2, r2, #96		) | ||||||
|  | 	PLD(	pld	[r1, #28]		) | ||||||
|  | 	PLD(	blt	4f			) | ||||||
|  | 	PLD(	pld	[r1, #60]		) | ||||||
|  | 	PLD(	pld	[r1, #92]		) | ||||||
|  | 
 | ||||||
|  | 3:	PLD(	pld	[r1, #124]		) | ||||||
|  | 4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f | ||||||
|  | 		subs	r2, r2, #32 | ||||||
|  | 		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f | ||||||
|  | 		bge	3b | ||||||
|  | 	PLD(	cmn	r2, #96			) | ||||||
|  | 	PLD(	bge	4b			) | ||||||
|  | 
 | ||||||
/*
 * Remaining whole words: ip = 32 - (r2 & 28) is added to pc so that
 * execution enters the ldr1w ladder (and later the str1w ladder) part-way
 * down and only the needed word transfers run.  The number of instructions
 * between each "add pc" and its ladder is therefore significant.
 */
|  | 5:		ands	ip, r2, #28 | ||||||
|  | 		rsb	ip, ip, #32 | ||||||
|  | #if LDR1W_SHIFT > 0 | ||||||
|  | 		lsl	ip, ip, #LDR1W_SHIFT | ||||||
|  | #endif | ||||||
|  | 		addne	pc, pc, ip		@ C is always clear here
 | ||||||
|  | 		b	7f | ||||||
|  | 6: | ||||||
|  | 		.rept	(1 << LDR1W_SHIFT) | ||||||
|  | 		W(nop) | ||||||
|  | 		.endr | ||||||
|  | 		ldr1w	r1, r3, abort=20f | ||||||
|  | 		ldr1w	r1, r4, abort=20f | ||||||
|  | 		ldr1w	r1, r5, abort=20f | ||||||
|  | 		ldr1w	r1, r6, abort=20f | ||||||
|  | 		ldr1w	r1, r7, abort=20f | ||||||
|  | 		ldr1w	r1, r8, abort=20f | ||||||
|  | 		ldr1w	r1, lr, abort=20f | ||||||
|  | 
 | ||||||
|  | #if LDR1W_SHIFT < STR1W_SHIFT | ||||||
|  | 		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT | ||||||
|  | #elif LDR1W_SHIFT > STR1W_SHIFT | ||||||
|  | 		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT | ||||||
|  | #endif | ||||||
|  | 		add	pc, pc, ip | ||||||
|  | 		nop | ||||||
|  | 		.rept	(1 << STR1W_SHIFT) | ||||||
|  | 		W(nop) | ||||||
|  | 		.endr | ||||||
|  | 		str1w	r0, r3, abort=20f | ||||||
|  | 		str1w	r0, r4, abort=20f | ||||||
|  | 		str1w	r0, r5, abort=20f | ||||||
|  | 		str1w	r0, r6, abort=20f | ||||||
|  | 		str1w	r0, r7, abort=20f | ||||||
|  | 		str1w	r0, r8, abort=20f | ||||||
|  | 		str1w	r0, lr, abort=20f | ||||||
|  | 
 | ||||||
|  | 	CALGN(	bcs	2b			) | ||||||
|  | 
 | ||||||
|  | 7:		ldmfd	sp!, {r5 - r8} | ||||||
|  | 
 | ||||||
/*
 * Tail of 0-3 bytes.  The shift moves bit 0 of r2 to bit 31 (result non-zero,
 * so "ne" copies one byte) and bit 1 into the carry flag ("cs" copies two
 * bytes).
 */
|  | 8:		movs	r2, r2, lsl #31 | ||||||
|  | 		ldr1b	r1, r3, ne, abort=21f | ||||||
|  | 		ldr1b	r1, r4, cs, abort=21f | ||||||
|  | 		ldr1b	r1, ip, cs, abort=21f | ||||||
|  | 		str1b	r0, r3, ne, abort=21f | ||||||
|  | 		str1b	r0, r4, cs, abort=21f | ||||||
|  | 		str1b	r0, ip, cs, abort=21f | ||||||
|  | 
 | ||||||
|  | 		exit	r4, pc | ||||||
|  | 
 | ||||||
/*
 * Destination not word aligned: ip = 4 - (r0 & 3) bytes (1 to 3) are copied
 * one at a time, then the count is reduced by ip.  If the source is word
 * aligned afterwards the aligned path at 1: is taken, otherwise execution
 * falls through to 10:.
 */
|  | 9:		rsb	ip, ip, #4 | ||||||
|  | 		cmp	ip, #2 | ||||||
|  | 		ldr1b	r1, r3, gt, abort=21f | ||||||
|  | 		ldr1b	r1, r4, ge, abort=21f | ||||||
|  | 		ldr1b	r1, lr, abort=21f | ||||||
|  | 		str1b	r0, r3, gt, abort=21f | ||||||
|  | 		str1b	r0, r4, ge, abort=21f | ||||||
|  | 		subs	r2, r2, ip | ||||||
|  | 		str1b	r0, lr, abort=21f | ||||||
|  | 		blt	8b | ||||||
|  | 		ands	ip, r1, #3 | ||||||
|  | 		beq	1b | ||||||
|  | 
 | ||||||
/*
 * Source not word aligned (ip = r1 & 3 is 1, 2 or 3).  r1 is rounded down
 * to a word boundary and the first word is loaded into lr.  ip == 2 goes to
 * 17:, ip == 3 to 18:, and ip == 1 falls through into the first
 * forward_copy_shift expansion below.
 */
|  | 10:		bic	r1, r1, #3 | ||||||
|  | 		cmp	ip, #2 | ||||||
|  | 		ldr1w	r1, lr, abort=21f | ||||||
|  | 		beq	17f | ||||||
|  | 		bgt	18f | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
/*
 * forward_copy_shift: copy from a word-aligned r1 while building each
 * output word from two consecutive input words:
 *   out = (previous "pull" #\pull) | (next "push" #\push)
 * where pull/push are the shift mnemonics selected in asm/assembler.h and
 * \pull, \push are bit counts.  lr carries the most recently loaded word
 * between steps.  At 16: r1 is moved back by \push / 8 bytes before the
 * byte tail at 8: runs.
 */
|  | 		.macro	forward_copy_shift pull push | ||||||
|  | 
 | ||||||
|  | 		subs	r2, r2, #28 | ||||||
|  | 		blt	14f | ||||||
|  | 
 | ||||||
|  | 	CALGN(	ands	ip, r0, #31		) | ||||||
|  | 	CALGN(	rsb	ip, ip, #32		) | ||||||
|  | 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
 | ||||||
|  | 	CALGN(	subcc	r2, r2, ip		) | ||||||
|  | 	CALGN(	bcc	15f			) | ||||||
|  | 
 | ||||||
|  | 11:		stmfd	sp!, {r5 - r9} | ||||||
|  | 
 | ||||||
|  | 	PLD(	pld	[r1, #0]		) | ||||||
|  | 	PLD(	subs	r2, r2, #96		) | ||||||
|  | 	PLD(	pld	[r1, #28]		) | ||||||
|  | 	PLD(	blt	13f			) | ||||||
|  | 	PLD(	pld	[r1, #60]		) | ||||||
|  | 	PLD(	pld	[r1, #92]		) | ||||||
|  | 
 | ||||||
|  | 12:	PLD(	pld	[r1, #124]		) | ||||||
|  | 13:		ldr4w	r1, r4, r5, r6, r7, abort=19f | ||||||
|  | 		mov	r3, lr, pull #\pull | ||||||
|  | 		subs	r2, r2, #32 | ||||||
|  | 		ldr4w	r1, r8, r9, ip, lr, abort=19f | ||||||
|  | 		orr	r3, r3, r4, push #\push | ||||||
|  | 		mov	r4, r4, pull #\pull | ||||||
|  | 		orr	r4, r4, r5, push #\push | ||||||
|  | 		mov	r5, r5, pull #\pull | ||||||
|  | 		orr	r5, r5, r6, push #\push | ||||||
|  | 		mov	r6, r6, pull #\pull | ||||||
|  | 		orr	r6, r6, r7, push #\push | ||||||
|  | 		mov	r7, r7, pull #\pull | ||||||
|  | 		orr	r7, r7, r8, push #\push | ||||||
|  | 		mov	r8, r8, pull #\pull | ||||||
|  | 		orr	r8, r8, r9, push #\push | ||||||
|  | 		mov	r9, r9, pull #\pull | ||||||
|  | 		orr	r9, r9, ip, push #\push | ||||||
|  | 		mov	ip, ip, pull #\pull | ||||||
|  | 		orr	ip, ip, lr, push #\push | ||||||
|  | 		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f | ||||||
|  | 		bge	12b | ||||||
|  | 	PLD(	cmn	r2, #96			) | ||||||
|  | 	PLD(	bge	13b			) | ||||||
|  | 
 | ||||||
|  | 		ldmfd	sp!, {r5 - r9} | ||||||
|  | 
 | ||||||
|  | 14:		ands	ip, r2, #28 | ||||||
|  | 		beq	16f | ||||||
|  | 
 | ||||||
|  | 15:		mov	r3, lr, pull #\pull | ||||||
|  | 		ldr1w	r1, lr, abort=21f | ||||||
|  | 		subs	ip, ip, #4 | ||||||
|  | 		orr	r3, r3, lr, push #\push | ||||||
|  | 		str1w	r0, r3, abort=21f | ||||||
|  | 		bgt	15b | ||||||
|  | 	CALGN(	cmp	r2, #0			) | ||||||
|  | 	CALGN(	bge	11b			) | ||||||
|  | 
 | ||||||
|  | 16:		sub	r1, r1, #(\push / 8) | ||||||
|  | 		b	8b | ||||||
|  | 
 | ||||||
|  | 		.endm | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 		forward_copy_shift	pull=8	push=24 | ||||||
|  | 
 | ||||||
|  | 17:		forward_copy_shift	pull=16	push=16 | ||||||
|  | 
 | ||||||
|  | 18:		forward_copy_shift	pull=24	push=8 | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,126 @@ | ||||||
|  | /* | ||||||
|  |  *  linux/arch/arm/lib/memset.S | ||||||
|  |  * | ||||||
|  |  *  Copyright (C) 1995-2000 Russell King | ||||||
|  |  * | ||||||
|  |  * This program is free software; you can redistribute it and/or modify
 | ||||||
|  |  * it under the terms of the GNU General Public License version 2 as | ||||||
|  |  * published by the Free Software Foundation. | ||||||
|  |  * | ||||||
|  |  *  ASM optimised string functions | ||||||
|  |  */ | ||||||
|  | #include <asm/assembler.h> | ||||||
|  | 
 | ||||||
/*
 * void *memset(void *s, int c, size_t n)
 *
 * In:   r0 = s, r1 = c, r2 = n
 * Out:  r0 = s.  r0 is never written by this routine, so the caller gets
 *       back the pointer it passed in, as the C library contract requires.
 *
 * Register use:
 *   ip      working store pointer (copy of r0, advanced by every store)
 *   r1, r3  fill pattern replicated into all four bytes
 *   r4-r8, lr  further copies of the pattern for the multi-word stores;
 *           r4-r8 and lr are saved on the stack and restored before return
 *   r2      remaining byte count; once fewer than 64 bytes are left only
 *           its individual bits are tested
 *
 * The previous version of this routine stored through r0 with write-back
 * and returned with r0 pointing past the end of the buffer.
 */
	.text
	.align	5

.globl memset
memset:
	ands	r3, r0, #3		@ 1 unaligned?
	mov	ip, r0			@ preserve r0 as return value
	bne	6f			@ 1
/*
 * we know that the pointer in ip is aligned to a word boundary.
 */
1:	orr	r1, r1, r1, lsl #8
	orr	r1, r1, r1, lsl #16
	mov	r3, r1
	cmp	r2, #16
	blt	4f

#if ! CALGN(1)+0

/*
 * We need 2 extra registers for this loop - use r8 and the LR
 */
	stmfd	sp!, {r8, lr}
	mov	r8, r1
	mov	lr, r1

2:	subs	r2, r2, #64
	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
	stmgeia	ip!, {r1, r3, r8, lr}
	stmgeia	ip!, {r1, r3, r8, lr}
	stmgeia	ip!, {r1, r3, r8, lr}
	bgt	2b
	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
/*
 * No need to correct the count; we're only testing bits from now on
 */
	tst	r2, #32
	stmneia	ip!, {r1, r3, r8, lr}
	stmneia	ip!, {r1, r3, r8, lr}
	tst	r2, #16
	stmneia	ip!, {r1, r3, r8, lr}
	ldmfd	sp!, {r8, lr}

#else

/*
 * This version aligns the destination pointer in order to write
 * whole cache lines at once.
 */

	stmfd	sp!, {r4-r8, lr}
	mov	r4, r1
	mov	r5, r1
	mov	r6, r1
	mov	r7, r1
	mov	r8, r1
	mov	lr, r1

	cmp	r2, #96
	tstgt	ip, #31
	ble	3f

/*
 * r8 temporarily holds the number of bytes (a multiple of 4, at most 28)
 * needed to reach a 32-byte boundary; its bits select a 16-, 8- and 4-byte
 * store.  It is reloaded with the pattern before the main loop.
 */
	and	r8, ip, #31
	rsb	r8, r8, #32
	sub	r2, r2, r8
	movs	r8, r8, lsl #(32 - 4)
	stmcsia	ip!, {r4, r5, r6, r7}
	stmmiia	ip!, {r4, r5}
	tst	r8, #(1 << 30)
	mov	r8, r1
	strne	r1, [ip], #4

3:	subs	r2, r2, #64
	stmgeia	ip!, {r1, r3-r8, lr}
	stmgeia	ip!, {r1, r3-r8, lr}
	bgt	3b
	ldmeqfd	sp!, {r4-r8, pc}

	tst	r2, #32
	stmneia	ip!, {r1, r3-r8, lr}
	tst	r2, #16
	stmneia	ip!, {r4-r7}
	ldmfd	sp!, {r4-r8, lr}

#endif

4:	tst	r2, #8
	stmneia	ip!, {r1, r3}
	tst	r2, #4
	strne	r1, [ip], #4
/*
 * When we get here, we've got fewer than 4 bytes left to set.  We
 * may have an unaligned pointer as well.
 */
5:	tst	r2, #2
	strneb	r1, [ip], #1
	strneb	r1, [ip], #1
	tst	r2, #1
	strneb	r1, [ip], #1
	mov	pc, lr

/*
 * Unaligned destination: r3 = s & 3.  Store 4 - r3 bytes to reach a word
 * boundary, adjust the count, then continue at 1: (past the entry code, so
 * the working pointer is not reloaded from r0).
 */
6:	subs	r2, r2, #4		@ 1 do we have enough
	blt	5b			@ 1 bytes to align with?
	cmp	r3, #2			@ 1
	strltb	r1, [ip], #1		@ 1
	strleb	r1, [ip], #1		@ 1
	strb	r1, [ip], #1		@ 1
	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
	b	1b
		Loading…
	
		Reference in New Issue