ARM: 8745/1: get rid of __memzero()

The __memzero assembly code is almost identical to memset's except for
two orr instructions. The runtime performance of __memzero(p, n) and
memset(p, 0, n) is accordingly almost identical.

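For context, the two instructions in question are the byte-broadcast
step that memset needs and a zero fill does not: the fill byte must be
replicated across a full word before the store loops run. A minimal C
sketch of that step (illustrative only, not taken from the patch):

	#include <stdint.h>

	/* Sketch: the word-wide fill value memset builds with two orr
	 * instructions; __memzero simply uses the constant 0 instead. */
	static uint32_t fill_word(int c)
	{
		uint32_t w = (unsigned char)c;	/* the fill byte */
		w |= w << 8;	/* first orr: replicate into bytes 0-1 */
		w |= w << 16;	/* second orr: replicate into all four bytes */
		return w;
	}
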
However, the memset() macro, which guards against a zero length and
substitutes a call to __memzero at compile time when the fill value is
a constant zero, interferes with compiler optimizations.

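To illustrate the point (a hedged sketch with made-up names, not part
of the commit): routing every call through the macro's statement
expression and runtime length test hides the builtin from the
compiler, whereas a direct memset() call with a constant size can be
expanded inline:

	#include <string.h>

	struct regs { unsigned int r[4]; };	/* hypothetical type */

	void clear_regs(struct regs *p)
	{
		/* Without the macro this is an ordinary builtin call that
		 * gcc can lower to a few stores; the old macro turned it
		 * into a length test plus a call to __memzero(). */
		memset(p, 0, sizeof(*p));
	}
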
Arnd found that the test against a zero length brings up some new
warnings with gcc v8:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82103

And successively removing the test against a zero length and the
call-to-__memzero optimization produces the following kernel sizes for
defconfig with gcc 6:

    text     data     bss       dec      hex  filename
12248142  6278960  413588  18940690  1210312  vmlinux.orig
12244474  6278960  413588  18937022  120f4be  vmlinux.no_zero_test
12239160  6278960  413588  18931708  120dffc  vmlinux.no_memzero

So it is probably not worth keeping __memzero around given that the
compiler can do a better job at inlining trivial memset(p,0,n) on its
own. And the memset code already handles a zero length just fine.

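To illustrate that last point (a hypothetical snippet, not from the
patch): a zero length is already a well-defined no-op for memset, so
the macro's guard bought nothing.

	#include <string.h>

	static char buf[16];

	void touch_nothing(void)
	{
		/* No-op: the C fallback's copy loop runs zero times, and
		 * the assembly implementation falls through likewise. */
		memset(buf, 0, 0);
	}
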
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
commit ff5fdafc9e (parent ec80eb4671)
Nicolas Pitre, 2018-01-19 18:17:46 +01:00; committed by Russell King
7 changed files with 5 additions and 161 deletions

--- a/arch/arm/boot/compressed/string.c
+++ b/arch/arm/boot/compressed/string.c
@@ -130,8 +130,3 @@ void *memset(void *s, int c, size_t count)
 		*xs++ = c;
 	return s;
 }
-
-void __memzero(void *s, size_t count)
-{
-	memset(s, 0, count);
-}

--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -39,18 +39,4 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
 	return __memset64(p, v, n * 8, v >> 32);
 }
 
-extern void __memzero(void *ptr, __kernel_size_t n);
-
-#define memset(p,v,n)							\
-	({								\
-		void *__p = (p); size_t __n = n;			\
-		if ((__n) != 0) {					\
-			if (__builtin_constant_p((v)) && (v) == 0)	\
-				__memzero((__p),(__n));			\
-			else						\
-				memset((__p),(v),(__n));		\
-		}							\
-		(__p);							\
-	})
-
 #endif

--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -92,7 +92,6 @@ EXPORT_SYMBOL(__memset64);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memchr);
-EXPORT_SYMBOL(__memzero);
 
 EXPORT_SYMBOL(mmioset);
 EXPORT_SYMBOL(mmiocpy);

--- a/arch/arm/kernel/head-common.S
+++ b/arch/arm/kernel/head-common.S
@@ -105,8 +105,9 @@ __mmap_switched:
  ARM(	ldmia	r4!, {r0, r1, sp}	)
  THUMB(	ldmia	r4!, {r0, r1, r3}	)
  THUMB(	mov	sp, r3	)
-	sub	r1, r1, r0
-	bl	__memzero		@ clear .bss
+	sub	r2, r1, r0
+	mov	r1, #0
+	bl	memset			@ clear .bss
 
 	ldmia	r4, {r0, r1, r2, r3}
 	str	r9, [r0]		@ Save processor ID

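The register shuffle in this hunk follows from the AAPCS calling
convention: memset() takes (pointer, fill byte, size) in r0-r2, while
__memzero() took (pointer, size) in r0-r1. In C terms the .bss
clearing amounts to the following sketch, where bss_start/bss_end are
stand-ins for the section bounds loaded into r0/r1 just above:

	#include <string.h>

	/* Stand-ins for the real linker-provided section symbols. */
	extern char bss_start[], bss_end[];

	void clear_bss(void)
	{
		/* r0 = bss_start, r1 = 0, r2 = bss_end - bss_start */
		memset(bss_start, 0, (size_t)(bss_end - bss_start));
	}
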
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -8,7 +8,7 @@
 lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
 		   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
-		   memmove.o memset.o memzero.o setbit.o              \
+		   memmove.o memset.o setbit.o                        \
 		   strchr.o strrchr.o \
 		   testchangebit.o testclearbit.o testsetbit.o        \
 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \

--- a/arch/arm/lib/memzero.S
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- *  linux/arch/arm/lib/memzero.S
- *
- *  Copyright (C) 1995-2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/unwind.h>
-
-	.text
-	.align	5
-	.word	0
-/*
- * Align the pointer in r0.  r3 contains the number of bytes that we are
- * mis-aligned by, and r1 is the number of bytes.  If r1 < 4, then we
- * don't bother; we use byte stores instead.
- */
-UNWIND(	.fnstart			)
-1:	subs	r1, r1, #4		@ 1 do we have enough
-	blt	5f			@ 1 bytes to align with?
-	cmp	r3, #2			@ 1
-	strltb	r2, [r0], #1		@ 1
-	strleb	r2, [r0], #1		@ 1
-	strb	r2, [r0], #1		@ 1
-	add	r1, r1, r3		@ 1 (r1 = r1 - (4 - r3))
-/*
- * The pointer is now aligned and the length is adjusted.  Try doing the
- * memzero again.
- */
-
-ENTRY(__memzero)
-	mov	r2, #0			@ 1
-	ands	r3, r0, #3		@ 1 unaligned?
-	bne	1b			@ 1
-/*
- * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary.
- */
-	cmp	r1, #16			@ 1 we can skip this chunk if we
-	blt	4f			@ 1 have < 16 bytes
-
-#if ! CALGN(1)+0
-
-/*
- * We need an extra register for this loop - save the return address and
- * use the LR
- */
-	str	lr, [sp, #-4]!		@ 1
-UNWIND(	.fnend				)
-UNWIND(	.fnstart			)
-UNWIND(	.save	{lr}			)
-	mov	ip, r2			@ 1
-	mov	lr, r2			@ 1
-
-3:	subs	r1, r1, #64		@ 1 write 32 bytes out per loop
-	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
-	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
-	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
-	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
-	bgt	3b			@ 1
-	ldmeqfd	sp!, {pc}		@ 1/2 quick exit
-/*
- * No need to correct the count; we're only testing bits from now on
- */
-	tst	r1, #32			@ 1
-	stmneia	r0!, {r2, r3, ip, lr}	@ 4
-	stmneia	r0!, {r2, r3, ip, lr}	@ 4
-	tst	r1, #16			@ 1 16 bytes or more?
-	stmneia	r0!, {r2, r3, ip, lr}	@ 4
-	ldr	lr, [sp], #4		@ 1
-UNWIND(	.fnend				)
-
-#else
-
-/*
- * This version aligns the destination pointer in order to write
- * whole cache lines at once.
- */
-
-	stmfd	sp!, {r4-r7, lr}
-UNWIND(	.fnend				)
-UNWIND(	.fnstart			)
-UNWIND(	.save	{r4-r7, lr}		)
-	mov	r4, r2
-	mov	r5, r2
-	mov	r6, r2
-	mov	r7, r2
-	mov	ip, r2
-	mov	lr, r2
-
-	cmp	r1, #96
-	andgts	ip, r0, #31
-	ble	3f
-
-	rsb	ip, ip, #32
-	sub	r1, r1, ip
-	movs	ip, ip, lsl #(32 - 4)
-	stmcsia	r0!, {r4, r5, r6, r7}
-	stmmiia	r0!, {r4, r5}
-	movs	ip, ip, lsl #2
-	strcs	r2, [r0], #4
-
-3:	subs	r1, r1, #64
-	stmgeia	r0!, {r2-r7, ip, lr}
-	stmgeia	r0!, {r2-r7, ip, lr}
-	bgt	3b
-	ldmeqfd	sp!, {r4-r7, pc}
-
-	tst	r1, #32
-	stmneia	r0!, {r2-r7, ip, lr}
-	tst	r1, #16
-	stmneia	r0!, {r4-r7}
-	ldmfd	sp!, {r4-r7, lr}
-UNWIND(	.fnend				)
-
-#endif
-
-UNWIND(	.fnstart			)
-4:	tst	r1, #8			@ 1 8 bytes or more?
-	stmneia	r0!, {r2, r3}		@ 2
-	tst	r1, #4			@ 1 4 bytes or more?
-	strne	r2, [r0], #4		@ 1
-/*
- * When we get here, we've got less than 4 bytes to zero.  We
- * may have an unaligned pointer as well.
- */
-5:	tst	r1, #2			@ 1 2 bytes or more?
-	strneb	r2, [r0], #1		@ 1
-	strneb	r2, [r0], #1		@ 1
-	tst	r1, #1			@ 1 a byte left over
-	strneb	r2, [r0], #1		@ 1
-	ret	lr			@ 1
-UNWIND(	.fnend				)
-ENDPROC(__memzero)

--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -765,7 +765,7 @@ static int imxdma_alloc_chan_resources(struct dma_chan *chan)
 		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 		if (!desc)
 			break;
-		__memzero(&desc->desc, sizeof(struct dma_async_tx_descriptor));
+		memset(&desc->desc, 0, sizeof(struct dma_async_tx_descriptor));
 		dma_async_tx_descriptor_init(&desc->desc, chan);
 		desc->desc.tx_submit = imxdma_tx_submit;
 		/* txd.flags will be overwritten in prep funcs */