RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

raid6: vx128x8  gen() 19705 MB/s
raid6: vx128x8  xor() 11886 MB/s
raid6: using algorithm vx128x8 gen() 19705 MB/s
raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

raid6: int64x1  gen()  3018 MB/s
raid6: int64x1  xor()  1429 MB/s
raid6: int64x2  gen()  4661 MB/s
raid6: int64x2  xor()  3143 MB/s
raid6: int64x4  gen()  5392 MB/s
raid6: int64x4  xor()  3509 MB/s
raid6: int64x8  gen()  4441 MB/s
raid6: int64x8  xor()  3207 MB/s
raid6: using algorithm int64x4 gen() 5392 MB/s
raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Martin Schwidefsky 2016-08-23 13:30:24 +02:00
parent 8f149ea6e9
commit 474fd6e80f
6 changed files with 265 additions and 0 deletions

View File

@ -278,6 +278,15 @@
VLVG \v, \gr, \index, 3
.endm
/* VECTOR LOAD REGISTER */
/* VLR v1, v2 - copy the contents of vector register v2 into v1. */
.macro VLR v1, v2
/* resolve symbolic operands to numeric vector register numbers */
VX_NUM v1, \v1
VX_NUM v2, \v2
/* hand-encode the instruction: opcode prefix plus low 4 bits of each reg */
.word 0xE700 | ((v1&15) << 4) | (v2&15)
.word 0
/* MRXBOPC presumably emits the RXB extension bits and opcode 0x56 —
 * defined earlier in this file, outside this view */
MRXBOPC 0, 0x56, v1, v2
.endm
/* VECTOR LOAD */
.macro VL v, disp, index="%r0", base
VX_NUM v1, \v
@ -404,6 +413,16 @@
/* Vector integer instructions */
/* VECTOR AND */
/* VN vr1, vr2, vr3 - bitwise AND: vr1 = vr2 & vr3 (all 128 bits). */
.macro VN vr1, vr2, vr3
VX_NUM v1, \vr1
VX_NUM v2, \vr2
VX_NUM v3, \vr3
/* low 4 bits of each register number are encoded in the instruction words;
 * the high bits go through MRXBOPC (RXB field) */
.word 0xE700 | ((v1&15) << 4) | (v2&15)
.word ((v3&15) << 12)
MRXBOPC 0, 0x68, v1, v2, v3
.endm
/* VECTOR EXCLUSIVE OR */
.macro VX vr1, vr2, vr3
VX_NUM v1, \vr1
@ -469,6 +488,73 @@
MRXBOPC 0, 0x7D, v1, v2, v3
.endm
/* VECTOR REPLICATE IMMEDIATE */
/* VREPI vr1, imm2, m3 - replicate the signed immediate imm2 into every
 * element of vr1; m3 selects the element size. */
.macro VREPI vr1, imm2, m3
VX_NUM v1, \vr1
.word 0xE700 | ((v1&15) << 4)
/* 16-bit immediate occupies the second instruction halfword */
.word \imm2
MRXBOPC \m3, 0x45, v1
.endm
/*
 * Element-size wrappers for VREPI. The trailing constant is the m3
 * element-size field: 0 = byte, 1 = halfword, 2 = word, 3 = doubleword.
 */
.macro VREPIB vr1, imm2
VREPI \vr1, \imm2, 0
.endm
.macro VREPIH vr1, imm2
VREPI \vr1, \imm2, 1
.endm
.macro VREPIF vr1, imm2
VREPI \vr1, \imm2, 2
.endm
.macro VREPIG vr1, imm2
/* Fix: this previously expanded to "VREP \vr1, \imm2, 3". VREP is a
 * different instruction with a different operand list (it takes a source
 * register and an element index, not an immediate), so the doubleword
 * variant must expand to VREPI like its B/H/F siblings. */
VREPI \vr1, \imm2, 3
.endm
/* VECTOR ADD */
/* VA vr1, vr2, vr3, m4 - elementwise add: vr1 = vr2 + vr3, with element
 * size selected by m4. */
.macro VA vr1, vr2, vr3, m4
VX_NUM v1, \vr1
VX_NUM v2, \vr2
VX_NUM v3, \vr3
/* low 4 bits of each register number encoded here; high bits via MRXBOPC */
.word 0xE700 | ((v1&15) << 4) | (v2&15)
.word ((v3&15) << 12)
MRXBOPC \m4, 0xF3, v1, v2, v3
.endm
/*
 * Element-size wrappers for VA. The trailing constant is the m4 field:
 * 0 = byte, 1 = halfword, 2 = word, 3 = doubleword, 4 = quadword.
 */
.macro VAB vr1, vr2, vr3
VA \vr1, \vr2, \vr3, 0
.endm
.macro VAH vr1, vr2, vr3
VA \vr1, \vr2, \vr3, 1
.endm
.macro VAF vr1, vr2, vr3
VA \vr1, \vr2, \vr3, 2
.endm
.macro VAG vr1, vr2, vr3
VA \vr1, \vr2, \vr3, 3
.endm
.macro VAQ vr1, vr2, vr3
VA \vr1, \vr2, \vr3, 4
.endm
/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
/* VESRAV vr1, vr2, vr3, m4 - arithmetic right shift of each element of
 * vr2 by the per-element count in vr3; result in vr1. Element size per m4. */
.macro VESRAV vr1, vr2, vr3, m4
VX_NUM v1, \vr1
VX_NUM v2, \vr2
VX_NUM v3, \vr3
.word 0xE700 | ((v1&15) << 4) | (v2&15)
.word ((v3&15) << 12)
MRXBOPC \m4, 0x7A, v1, v2, v3
.endm
/*
 * Element-size wrappers for VESRAV. The trailing constant is the m4
 * field: 0 = byte, 1 = halfword, 2 = word, 3 = doubleword.
 */
.macro VESRAVB vr1, vr2, vr3
VESRAV \vr1, \vr2, \vr3, 0
.endm
.macro VESRAVH vr1, vr2, vr3
VESRAV \vr1, \vr2, \vr3, 1
.endm
.macro VESRAVF vr1, vr2, vr3
VESRAV \vr1, \vr2, \vr3, 2
.endm
.macro VESRAVG vr1, vr2, vr3
VESRAV \vr1, \vr2, \vr3, 3
.endm
#endif /* __ASSEMBLY__ */
#endif /* __ASM_S390_VX_INSN_H */

View File

@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
extern const struct raid6_calls raid6_avx2x2;
extern const struct raid6_calls raid6_avx2x4;
extern const struct raid6_calls raid6_tilegx8;
extern const struct raid6_calls raid6_s390vx8;
struct raid6_recov_calls {
void (*data2)(int, size_t, int, int, void **);

View File

@ -3,3 +3,4 @@ altivec*.c
int*.c
tables.c
neon?.c
s390vx?.c

View File

@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
raid6_pq-$(CONFIG_S390) += s390vx8.o
hostprogs-y += mktables
@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8
$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
targets += s390vx8.c
$(obj)/s390vx8.c: UNROLL := 8
$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
quiet_cmd_mktable = TABLE $@
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )

View File

@ -68,6 +68,9 @@ const struct raid6_calls * const raid6_algos[] = {
#endif
#if defined(CONFIG_TILEGX)
&raid6_tilegx8,
#endif
#if defined(CONFIG_S390)
&raid6_s390vx8,
#endif
&raid6_intx1,
&raid6_intx2,

168
lib/raid6/s390vx.uc Normal file
View File

@ -0,0 +1,168 @@
/*
* raid6_vx$#.c
*
* $#-way unrolled RAID6 gen/xor functions for s390
* based on the vector facility
*
* Copyright IBM Corp. 2016
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*
* This file is postprocessed using unroll.awk.
*/
#include <linux/raid/pq.h>
#include <asm/fpu/api.h>
asm(".include \"asm/vx-insn.h\"\n");
#define NSIZE 16
/*
 * Preload the two constant vector registers used by the syndrome loops:
 *   %v24 = 0x07 replicated in every byte (shift count used by MASK()),
 *   %v25 = 0x1d replicated in every byte (the RAID6 GF(2^8) reduction
 *          polynomial, applied after the doubling shift).
 */
static inline void LOAD_CONST(void)
{
asm volatile("VREPIB %v24,7");
asm volatile("VREPIB %v25,0x1d");
}
/*
 * The SHLBYTE() operation shifts each of the 16 bytes in
 * vector register y left by 1 bit and stores the result in
 * vector register x.
 *
 * Implemented as a byte-wise vector add of y to itself (y + y == y << 1
 * per byte, with no carry between byte elements).
 */
static inline void SHLBYTE(int x, int y)
{
asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
}
/*
 * For each of the 16 bytes in the vector register y the MASK()
 * operation returns 0xFF if the high bit of the byte is 1,
 * or 0x00 if the high bit is 0. The result is stored in vector
 * register x.
 *
 * The third operand, register 24, holds 7 in every byte (set up by
 * LOAD_CONST()); an arithmetic right shift by 7 smears each byte's sign
 * bit across the whole byte.
 */
static inline void MASK(int x, int y)
{
asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
}
/* Bitwise AND of vector registers: x = y & z. */
static inline void AND(int x, int y, int z)
{
asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
}
/* Bitwise exclusive-or of vector registers: x = y ^ z. */
static inline void XOR(int x, int y, int z)
{
asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
}
/*
 * Load n consecutive vector registers, starting at register x, from
 * memory at ptr (VLM = vector load multiple).
 */
static inline void LOAD_DATA(int x, int n, u8 *ptr)
{
/* dummy struct spanning exactly the bytes read, so the "m" constraint
 * tells the compiler the true size of the memory access */
typedef struct { u8 _[16*n]; } addrtype;
/* force the base address into GPR 1, which the VLM template hard-codes */
register addrtype *__ptr asm("1") = (addrtype *) ptr;
/* NOTE(review): the base is written "%r1" here but plain "1" in
 * STORE_DATA — both name GPR 1; confirm "%r1" survives gcc's asm
 * operand substitution as intended. */
asm volatile ("VLM %2,%3,0,%r1"
: : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
}
/*
 * Store n consecutive vector registers, starting at register x, to
 * memory at ptr (VSTM = vector store multiple).
 */
static inline void STORE_DATA(int x, int n, u8 *ptr)
{
/* dummy struct spanning exactly the bytes written, so the "=m" output
 * constraint tells the compiler the true size of the memory access */
typedef struct { u8 _[16*n]; } addrtype;
/* force the base address into GPR 1, the hard-coded base register */
register addrtype *__ptr asm("1") = (addrtype *) ptr;
asm volatile ("VSTM %2,%3,0,1"
: "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
}
/* Copy vector register y into vector register x (VLR). */
static inline void COPY_VEC(int x, int y)
{
asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
}
/*
 * Compute the P (XOR parity) and Q (Reed-Solomon syndrome) pages over
 * all data disks, processing $# vector registers (16*$# bytes) per
 * loop iteration. $# and $$ are unroll.awk placeholders: $# is the
 * unroll factor, $$ replicates the statement for each way 0..$#-1.
 *
 * Vector register convention inside the loop:
 *   %v0  + way : P accumulators
 *   %v8  + way : Q accumulators
 *   %v16 + way : current data block / scratch
 *   %v24, %v25 : constants set up by LOAD_CONST()
 */
static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
struct kernel_fpu vxstate;
u8 **dptr, *p, *q;
int d, z, z0;
/* save the caller's vector state and claim the vector registers */
kernel_fpu_begin(&vxstate, KERNEL_VXR);
LOAD_CONST();
dptr = (u8 **) ptrs;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0 + 1]; /* XOR parity */
q = dptr[z0 + 2]; /* RS syndrome */
for (d = 0; d < bytes; d += $#*NSIZE) {
/* seed P and Q with the highest data disk's block */
LOAD_DATA(0,$#,&dptr[z0][d]);
COPY_VEC(8+$$,0+$$);
for (z = z0 - 1; z >= 0; z--) {
/* GF(2^8) doubling of Q: mask = 0xff where the high bit is
 * set, reduce with the 0x1d polynomial after shifting left */
MASK(16+$$,8+$$);
AND(16+$$,16+$$,25);
SHLBYTE(8+$$,8+$$);
XOR(8+$$,8+$$,16+$$);
/* fold in the next (lower) data disk: P ^= data, Q ^= data */
LOAD_DATA(16,$#,&dptr[z][d]);
XOR(0+$$,0+$$,16+$$);
XOR(8+$$,8+$$,16+$$);
}
STORE_DATA(0,$#,&p[d]);
STORE_DATA(8,$#,&q[d]);
}
kernel_fpu_end(&vxstate, KERNEL_VXR);
}
/*
 * Update existing P/Q pages for a partial (read-modify-write) stripe
 * write covering data disks [start, stop]. Same register convention as
 * gen_syndrome: %v0+way = P delta, %v8+way = Q delta, %v16+way scratch.
 */
static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
struct kernel_fpu vxstate;
u8 **dptr, *p, *q;
int d, z, z0;
dptr = (u8 **) ptrs;
z0 = stop; /* P/Q right side optimization */
p = dptr[disks - 2]; /* XOR parity */
q = dptr[disks - 1]; /* RS syndrome */
kernel_fpu_begin(&vxstate, KERNEL_VXR);
LOAD_CONST();
for (d = 0; d < bytes; d += $#*NSIZE) {
/* P/Q data pages */
LOAD_DATA(0,$#,&dptr[z0][d]);
COPY_VEC(8+$$,0+$$);
for (z = z0 - 1; z >= start; z--) {
/* accumulate P/Q deltas over the disks being rewritten */
MASK(16+$$,8+$$);
AND(16+$$,16+$$,25);
SHLBYTE(8+$$,8+$$);
XOR(8+$$,8+$$,16+$$);
LOAD_DATA(16,$#,&dptr[z][d]);
XOR(0+$$,0+$$,16+$$);
XOR(8+$$,8+$$,16+$$);
}
/* P/Q left side optimization */
for (z = start - 1; z >= 0; z--) {
/* disks below 'start' are unchanged: only keep multiplying
 * the Q delta by 2 in GF(2^8), no data loads needed */
MASK(16+$$,8+$$);
AND(16+$$,16+$$,25);
SHLBYTE(8+$$,8+$$);
XOR(8+$$,8+$$,16+$$);
}
/* fold the deltas into the existing P and Q pages */
LOAD_DATA(16,$#,&p[d]);
XOR(16+$$,16+$$,0+$$);
STORE_DATA(16,$#,&p[d]);
LOAD_DATA(16,$#,&q[d]);
XOR(16+$$,16+$$,8+$$);
STORE_DATA(16,$#,&q[d]);
}
kernel_fpu_end(&vxstate, KERNEL_VXR);
}
/* Runtime check: this algorithm is usable only if the CPU has the
 * vector facility (MACHINE_HAS_VX). */
static int raid6_s390vx$#_valid(void)
{
return MACHINE_HAS_VX;
}
const struct raid6_calls raid6_s390vx$# = {
raid6_s390vx$#_gen_syndrome,
raid6_s390vx$#_xor_syndrome,
raid6_s390vx$#_valid,
"vx128x$#",
1
};