From 501046621dc99cee05f6345069d44dd15cc680cd Mon Sep 17 00:00:00 2001 From: Cliff Chen Date: Wed, 2 May 2018 09:31:40 +0800 Subject: [PATCH] crypto: fix NEON bit sliced AES decryption error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NEON bit-sliced AES decryption stores the key schedule in the BS_KEY struct, which introduces a race condition, e.g. with dm-crypt when the same_cpu_crypt option is not set. So backport the implementation from Linux stable version 4.14.13. Change-Id: I8c728669ae626f56b38c24ed391aa3078a60f623 Signed-off-by: Cliff Chen --- arch/arm/crypto/Makefile | 2 +- arch/arm/crypto/aes-neonbs-core.S | 1023 +++++++++++++++++++++++++++++ arch/arm/crypto/aesbs-glue.c | 190 +++--- 3 files changed, 1140 insertions(+), 75 deletions(-) create mode 100644 arch/arm/crypto/aes-neonbs-core.S diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 1d0448a875ee..2c384a48a534 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -26,7 +26,7 @@ endif endif aes-arm-y := aes-armv4.o aes_glue.o -aes-arm-bs-y := aesbs-core.o aesbs-glue.o +aes-arm-bs-y := aes-neonbs-core.o aesbs-core.o aesbs-glue.o sha1-arm-y := sha1-armv4-large.o sha1_glue.o sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o diff --git a/arch/arm/crypto/aes-neonbs-core.S b/arch/arm/crypto/aes-neonbs-core.S new file mode 100644 index 000000000000..2b625c6d4712 --- /dev/null +++ b/arch/arm/crypto/aes-neonbs-core.S @@ -0,0 +1,1023 @@ +/* + * Bit sliced AES using NEON instructions + * + * Copyright (C) 2017 Linaro Ltd. + * Author: Ard Biesheuvel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * The algorithm implemented here is described in detail by the paper + * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and + * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) + * + * This implementation is based primarily on the OpenSSL implementation + * for 32-bit ARM written by Andy Polyakov + */ + +#include +#include + + .text + .fpu neon + + rounds .req ip + bskey .req r4 + + q0l .req d0 + q0h .req d1 + q1l .req d2 + q1h .req d3 + q2l .req d4 + q2h .req d5 + q3l .req d6 + q3h .req d7 + q4l .req d8 + q4h .req d9 + q5l .req d10 + q5h .req d11 + q6l .req d12 + q6h .req d13 + q7l .req d14 + q7h .req d15 + q8l .req d16 + q8h .req d17 + q9l .req d18 + q9h .req d19 + q10l .req d20 + q10h .req d21 + q11l .req d22 + q11h .req d23 + q12l .req d24 + q12h .req d25 + q13l .req d26 + q13h .req d27 + q14l .req d28 + q14h .req d29 + q15l .req d30 + q15h .req d31 + + .macro __tbl, out, tbl, in, tmp + .ifc \out, \tbl + .ifb \tmp + .error __tbl needs temp register if out == tbl + .endif + vmov \tmp, \out + .endif + vtbl.8 \out\()l, {\tbl}, \in\()l + .ifc \out, \tbl + vtbl.8 \out\()h, {\tmp}, \in\()h + .else + vtbl.8 \out\()h, {\tbl}, \in\()h + .endif + .endm + + .macro __ldr, out, sym + vldr \out\()l, \sym + vldr \out\()h, \sym + 8 + .endm + + .macro __adr, reg, lbl + adr \reg, \lbl +THUMB( orr \reg, \reg, #1 ) + .endm + + .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + veor \b2, \b2, \b1 + veor \b5, \b5, \b6 + veor \b3, \b3, \b0 + veor \b6, \b6, \b2 + veor \b5, \b5, \b0 + veor \b6, \b6, \b3 + veor \b3, \b3, \b7 + veor \b7, \b7, \b5 + veor \b3, \b3, \b4 + veor \b4, \b4, \b5 + veor \b2, \b2, \b7 + veor \b3, \b3, \b1 + veor \b1, \b1, \b5 + .endm + + .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + veor \b0, \b0, \b6 + veor \b1, \b1, \b4 + veor \b4, \b4, \b6 + veor \b2, \b2, \b0 + veor \b6, \b6, \b1 + veor \b1, \b1, \b5 + veor \b5, \b5, \b3 + veor \b3, \b3, \b7 + veor \b7, \b7, \b5 + veor \b2, \b2, \b5 + veor \b4, \b4, \b7 + .endm + + .macro 
inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 + veor \b1, \b1, \b7 + veor \b4, \b4, \b7 + veor \b7, \b7, \b5 + veor \b1, \b1, \b3 + veor \b2, \b2, \b5 + veor \b3, \b3, \b7 + veor \b6, \b6, \b1 + veor \b2, \b2, \b0 + veor \b5, \b5, \b3 + veor \b4, \b4, \b6 + veor \b0, \b0, \b6 + veor \b1, \b1, \b4 + .endm + + .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 + veor \b1, \b1, \b5 + veor \b2, \b2, \b7 + veor \b3, \b3, \b1 + veor \b4, \b4, \b5 + veor \b7, \b7, \b5 + veor \b3, \b3, \b4 + veor \b5, \b5, \b0 + veor \b3, \b3, \b7 + veor \b6, \b6, \b2 + veor \b2, \b2, \b1 + veor \b6, \b6, \b3 + veor \b3, \b3, \b0 + veor \b5, \b5, \b6 + .endm + + .macro mul_gf4, x0, x1, y0, y1, t0, t1 + veor \t0, \y0, \y1 + vand \t0, \t0, \x0 + veor \x0, \x0, \x1 + vand \t1, \x1, \y0 + vand \x0, \x0, \y1 + veor \x1, \t1, \t0 + veor \x0, \x0, \t1 + .endm + + .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 + veor \t0, \y0, \y1 + veor \t1, \y2, \y3 + vand \t0, \t0, \x0 + vand \t1, \t1, \x2 + veor \x0, \x0, \x1 + veor \x2, \x2, \x3 + vand \x1, \x1, \y0 + vand \x3, \x3, \y2 + vand \x0, \x0, \y1 + vand \x2, \x2, \y3 + veor \x1, \x1, \x0 + veor \x2, \x2, \x3 + veor \x0, \x0, \t0 + veor \x3, \x3, \t1 + .endm + + .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, t0, t1, t2, t3 + veor \t0, \x0, \x2 + veor \t1, \x1, \x3 + mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 + veor \y0, \y0, \y2 + veor \y1, \y1, \y3 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 + veor \x0, \x0, \t0 + veor \x2, \x2, \t0 + veor \x1, \x1, \t1 + veor \x3, \x3, \t1 + veor \t0, \x4, \x6 + veor \t1, \x5, \x7 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 + veor \y0, \y0, \y2 + veor \y1, \y1, \y3 + mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 + veor \x4, \x4, \t0 + veor \x6, \x6, \t0 + veor \x5, \x5, \t1 + veor \x7, \x7, \t1 + .endm + + .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + veor \t3, \x4, \x6 + veor \t0, \x5, \x7 + veor \t1, \x1, 
\x3 + veor \s1, \x7, \x6 + veor \s0, \x0, \x2 + veor \s3, \t3, \t0 + vorr \t2, \t0, \t1 + vand \s2, \t3, \s0 + vorr \t3, \t3, \s0 + veor \s0, \s0, \t1 + vand \t0, \t0, \t1 + veor \t1, \x3, \x2 + vand \s3, \s3, \s0 + vand \s1, \s1, \t1 + veor \t1, \x4, \x5 + veor \s0, \x1, \x0 + veor \t3, \t3, \s1 + veor \t2, \t2, \s1 + vand \s1, \t1, \s0 + vorr \t1, \t1, \s0 + veor \t3, \t3, \s3 + veor \t0, \t0, \s1 + veor \t2, \t2, \s2 + veor \t1, \t1, \s3 + veor \t0, \t0, \s2 + vand \s0, \x7, \x3 + veor \t1, \t1, \s2 + vand \s1, \x6, \x2 + vand \s2, \x5, \x1 + vorr \s3, \x4, \x0 + veor \t3, \t3, \s0 + veor \t1, \t1, \s2 + veor \s0, \t0, \s3 + veor \t2, \t2, \s1 + vand \s2, \t3, \t1 + veor \s1, \t2, \s2 + veor \s3, \s0, \s2 + vbsl \s1, \t1, \s0 + vmvn \t0, \s0 + vbsl \s0, \s1, \s3 + vbsl \t0, \s1, \s3 + vbsl \s3, \t3, \t2 + veor \t3, \t3, \t2 + vand \s2, \s0, \s3 + veor \t1, \t1, \t0 + veor \s2, \s2, \t3 + mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + .endm + + .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + in_bs_ch \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 + inv_gf256 \b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \ + \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + out_bs_ch \b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3 + .endm + + .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + inv_in_bs_ch \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 + inv_gf256 \b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \ + \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + inv_out_bs_ch \b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6 + .endm + + .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, mask + vld1.8 {\t0-\t1}, [bskey, :256]! + veor \t0, \t0, \x0 + vld1.8 {\t2-\t3}, [bskey, :256]! + veor \t1, \t1, \x1 + __tbl \x0, \t0, \mask + veor \t2, \t2, \x2 + __tbl \x1, \t1, \mask + vld1.8 {\t0-\t1}, [bskey, :256]! + veor \t3, \t3, \x3 + __tbl \x2, \t2, \mask + __tbl \x3, \t3, \mask + vld1.8 {\t2-\t3}, [bskey, :256]! 
+ veor \t0, \t0, \x4 + veor \t1, \t1, \x5 + __tbl \x4, \t0, \mask + veor \t2, \t2, \x6 + __tbl \x5, \t1, \mask + veor \t3, \t3, \x7 + __tbl \x6, \t2, \mask + __tbl \x7, \t3, \mask + .endm + + .macro inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, mask + __tbl \x0, \x0, \mask, \t0 + __tbl \x1, \x1, \mask, \t1 + __tbl \x2, \x2, \mask, \t2 + __tbl \x3, \x3, \mask, \t3 + __tbl \x4, \x4, \mask, \t0 + __tbl \x5, \x5, \mask, \t1 + __tbl \x6, \x6, \mask, \t2 + __tbl \x7, \x7, \mask, \t3 + .endm + + .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7, inv + vext.8 \t0, \x0, \x0, #12 + vext.8 \t1, \x1, \x1, #12 + veor \x0, \x0, \t0 + vext.8 \t2, \x2, \x2, #12 + veor \x1, \x1, \t1 + vext.8 \t3, \x3, \x3, #12 + veor \x2, \x2, \t2 + vext.8 \t4, \x4, \x4, #12 + veor \x3, \x3, \t3 + vext.8 \t5, \x5, \x5, #12 + veor \x4, \x4, \t4 + vext.8 \t6, \x6, \x6, #12 + veor \x5, \x5, \t5 + vext.8 \t7, \x7, \x7, #12 + veor \x6, \x6, \t6 + veor \t1, \t1, \x0 + veor.8 \x7, \x7, \t7 + vext.8 \x0, \x0, \x0, #8 + veor \t2, \t2, \x1 + veor \t0, \t0, \x7 + veor \t1, \t1, \x7 + vext.8 \x1, \x1, \x1, #8 + veor \t5, \t5, \x4 + veor \x0, \x0, \t0 + veor \t6, \t6, \x5 + veor \x1, \x1, \t1 + vext.8 \t0, \x4, \x4, #8 + veor \t4, \t4, \x3 + vext.8 \t1, \x5, \x5, #8 + veor \t7, \t7, \x6 + vext.8 \x4, \x3, \x3, #8 + veor \t3, \t3, \x2 + vext.8 \x5, \x7, \x7, #8 + veor \t4, \t4, \x7 + vext.8 \x3, \x6, \x6, #8 + veor \t3, \t3, \x7 + vext.8 \x6, \x2, \x2, #8 + veor \x7, \t1, \t5 + .ifb \inv + veor \x2, \t0, \t4 + veor \x4, \x4, \t3 + veor \x5, \x5, \t7 + veor \x3, \x3, \t6 + veor \x6, \x6, \t2 + .else + veor \t3, \t3, \x4 + veor \x5, \x5, \t7 + veor \x2, \x3, \t6 + veor \x3, \t0, \t4 + veor \x4, \x6, \t2 + vmov \x6, \t3 + .endif + .endm + + .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7 + vld1.8 {\t0-\t1}, [bskey, :256]! + veor \x0, \x0, \t0 + vld1.8 {\t2-\t3}, [bskey, :256]! 
+ veor \x1, \x1, \t1 + vld1.8 {\t4-\t5}, [bskey, :256]! + veor \x2, \x2, \t2 + vld1.8 {\t6-\t7}, [bskey, :256] + sub bskey, bskey, #224 + veor \x3, \x3, \t3 + veor \x4, \x4, \t4 + veor \x5, \x5, \t5 + veor \x6, \x6, \t6 + veor \x7, \x7, \t7 + vext.8 \t0, \x0, \x0, #8 + vext.8 \t6, \x6, \x6, #8 + vext.8 \t7, \x7, \x7, #8 + veor \t0, \t0, \x0 + vext.8 \t1, \x1, \x1, #8 + veor \t6, \t6, \x6 + vext.8 \t2, \x2, \x2, #8 + veor \t7, \t7, \x7 + vext.8 \t3, \x3, \x3, #8 + veor \t1, \t1, \x1 + vext.8 \t4, \x4, \x4, #8 + veor \t2, \t2, \x2 + vext.8 \t5, \x5, \x5, #8 + veor \t3, \t3, \x3 + veor \t4, \t4, \x4 + veor \t5, \t5, \x5 + veor \x0, \x0, \t6 + veor \x1, \x1, \t6 + veor \x2, \x2, \t0 + veor \x4, \x4, \t2 + veor \x3, \x3, \t1 + veor \x1, \x1, \t7 + veor \x2, \x2, \t7 + veor \x4, \x4, \t6 + veor \x5, \x5, \t3 + veor \x3, \x3, \t6 + veor \x6, \x6, \t4 + veor \x4, \x4, \t7 + veor \x5, \x5, \t7 + veor \x7, \x7, \t5 + mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 + .endm + + .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 + vshr.u64 \t0, \b0, #\n + vshr.u64 \t1, \b1, #\n + veor \t0, \t0, \a0 + veor \t1, \t1, \a1 + vand \t0, \t0, \mask + vand \t1, \t1, \mask + veor \a0, \a0, \t0 + vshl.s64 \t0, \t0, #\n + veor \a1, \a1, \t1 + vshl.s64 \t1, \t1, #\n + veor \b0, \b0, \t0 + veor \b1, \b1, \t1 + .endm + + .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 + vmov.i8 \t0, #0x55 + vmov.i8 \t1, #0x33 + swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 + swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 + vmov.i8 \t0, #0x0f + swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 + swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 + swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 + swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 + .endm + + .align 4 +M0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d + + /* + * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) + */ +ENTRY(aesbs_convert_key) + vld1.32 {q7}, [r1]! 
// load round 0 key + vld1.32 {q15}, [r1]! // load round 1 key + + vmov.i8 q8, #0x01 // bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + __ldr q14, M0 + + sub r2, r2, #1 + vst1.8 {q7}, [r0, :128]! // save round 0 key + +.Lkey_loop: + __tbl q7, q15, q14 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.32 {q15}, [r1]! // load next round key + vmvn q0, q0 + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 + + subs r2, r2, #1 + vst1.8 {q0-q1}, [r0, :256]! + vst1.8 {q2-q3}, [r0, :256]! + vst1.8 {q4-q5}, [r0, :256]! + vst1.8 {q6-q7}, [r0, :256]! + bne .Lkey_loop + + vmov.i8 q7, #0x63 // compose .L63 + veor q15, q15, q7 + vst1.8 {q15}, [r0, :128] + bx lr +ENDPROC(aesbs_convert_key) + + .align 4 +M0SR: .quad 0x0a0e02060f03070b, 0x0004080c05090d01 + +aesbs_encrypt8: + vld1.8 {q9}, [bskey, :128]! 
// round 0 key + __ldr q8, M0SR + + veor q10, q0, q9 // xor with round0 key + veor q11, q1, q9 + __tbl q0, q10, q8 + veor q12, q2, q9 + __tbl q1, q11, q8 + veor q13, q3, q9 + __tbl q2, q12, q8 + veor q14, q4, q9 + __tbl q3, q13, q8 + veor q15, q5, q9 + __tbl q4, q14, q8 + veor q10, q6, q9 + __tbl q5, q15, q8 + veor q11, q7, q9 + __tbl q6, q10, q8 + __tbl q7, q11, q8 + + bitslice q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11 + + sub rounds, rounds, #1 + b .Lenc_sbox + + .align 5 +SR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +SRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d + +.Lenc_last: + __ldr q12, SRM0 +.Lenc_loop: + shift_rows q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12 +.Lenc_sbox: + sbox q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \ + q13, q14, q15 + subs rounds, rounds, #1 + bcc .Lenc_done + + mix_cols q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \ + q13, q14, q15 + + beq .Lenc_last + __ldr q12, SR + b .Lenc_loop + +.Lenc_done: + vld1.8 {q12}, [bskey, :128] // last round key + + bitslice q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11 + + veor q0, q0, q12 + veor q1, q1, q12 + veor q4, q4, q12 + veor q6, q6, q12 + veor q3, q3, q12 + veor q7, q7, q12 + veor q2, q2, q12 + veor q5, q5, q12 + bx lr +ENDPROC(aesbs_encrypt8) + + .align 4 +M0ISR: .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 + +aesbs_decrypt8: + add bskey, bskey, rounds, lsl #7 + sub bskey, bskey, #112 + vld1.8 {q9}, [bskey, :128] // round 0 key + sub bskey, bskey, #128 + __ldr q8, M0ISR + + veor q10, q0, q9 // xor with round0 key + veor q11, q1, q9 + __tbl q0, q10, q8 + veor q12, q2, q9 + __tbl q1, q11, q8 + veor q13, q3, q9 + __tbl q2, q12, q8 + veor q14, q4, q9 + __tbl q3, q13, q8 + veor q15, q5, q9 + __tbl q4, q14, q8 + veor q10, q6, q9 + __tbl q5, q15, q8 + veor q11, q7, q9 + __tbl q6, q10, q8 + __tbl q7, q11, q8 + + bitslice q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11 + + sub rounds, rounds, #1 + b .Ldec_sbox + + .align 5 +ISR: .quad 
0x0504070602010003, 0x0f0e0d0c080b0a09 +ISRM0: .quad 0x01040b0e0205080f, 0x0306090c00070a0d + +.Ldec_last: + __ldr q12, ISRM0 +.Ldec_loop: + inv_shift_rows q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12 +.Ldec_sbox: + inv_sbox q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \ + q13, q14, q15 + subs rounds, rounds, #1 + bcc .Ldec_done + + inv_mix_cols q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \ + q13, q14, q15 + + beq .Ldec_last + __ldr q12, ISR + b .Ldec_loop + +.Ldec_done: + add bskey, bskey, #112 + vld1.8 {q12}, [bskey, :128] // last round key + + bitslice q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11 + + veor q0, q0, q12 + veor q1, q1, q12 + veor q6, q6, q12 + veor q4, q4, q12 + veor q2, q2, q12 + veor q7, q7, q12 + veor q3, q3, q12 + veor q5, q5, q12 + bx lr +ENDPROC(aesbs_decrypt8) + + /* + * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ + .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + push {r4-r6, lr} + ldr r5, [sp, #16] // number of blocks + +99: __adr ip, 0f + and lr, r5, #7 + cmp r5, #8 + sub ip, ip, lr, lsl #2 + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q0}, [r1]! + vld1.8 {q1}, [r1]! + vld1.8 {q2}, [r1]! + vld1.8 {q3}, [r1]! + vld1.8 {q4}, [r1]! + vld1.8 {q5}, [r1]! + vld1.8 {q6}, [r1]! + vld1.8 {q7}, [r1]! + +0: mov bskey, r2 + mov rounds, r3 + bl \do8 + + __adr ip, 1f + and lr, r5, #7 + cmp r5, #8 + sub ip, ip, lr, lsl #2 + bxlt ip // computed goto if blocks < 8 + + vst1.8 {\o0}, [r0]! + vst1.8 {\o1}, [r0]! + vst1.8 {\o2}, [r0]! + vst1.8 {\o3}, [r0]! + vst1.8 {\o4}, [r0]! + vst1.8 {\o5}, [r0]! + vst1.8 {\o6}, [r0]! + vst1.8 {\o7}, [r0]! 
+ +1: subs r5, r5, #8 + bgt 99b + + pop {r4-r6, pc} + .endm + + .align 4 +ENTRY(aesbs_ecb_encrypt) + __ecb_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5 +ENDPROC(aesbs_ecb_encrypt) + + .align 4 +ENTRY(aesbs_ecb_decrypt) + __ecb_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5 +ENDPROC(aesbs_ecb_decrypt) + + /* + * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + * int rounds, int blocks, u8 iv[]) + */ + .align 4 +ENTRY(aesbs_cbc_decrypt) + mov ip, sp + push {r4-r6, lr} + ldm ip, {r5-r6} // load args 4-5 + +99: __adr ip, 0f + and lr, r5, #7 + cmp r5, #8 + sub ip, ip, lr, lsl #2 + mov lr, r1 + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q0}, [lr]! + vld1.8 {q1}, [lr]! + vld1.8 {q2}, [lr]! + vld1.8 {q3}, [lr]! + vld1.8 {q4}, [lr]! + vld1.8 {q5}, [lr]! + vld1.8 {q6}, [lr]! + vld1.8 {q7}, [lr] + +0: mov bskey, r2 + mov rounds, r3 + bl aesbs_decrypt8 + + vld1.8 {q8}, [r6] + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + vmov q12, q8 + vmov q13, q8 + vmov q14, q8 + vmov q15, q8 + + __adr ip, 1f + and lr, r5, #7 + cmp r5, #8 + sub ip, ip, lr, lsl #2 + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q9}, [r1]! + vld1.8 {q10}, [r1]! + vld1.8 {q11}, [r1]! + vld1.8 {q12}, [r1]! + vld1.8 {q13}, [r1]! + vld1.8 {q14}, [r1]! + vld1.8 {q15}, [r1]! + W(nop) + +1: __adr ip, 2f + sub ip, ip, lr, lsl #3 + bxlt ip // computed goto if blocks < 8 + + veor q0, q0, q8 + vst1.8 {q0}, [r0]! + veor q1, q1, q9 + vst1.8 {q1}, [r0]! + veor q6, q6, q10 + vst1.8 {q6}, [r0]! + veor q4, q4, q11 + vst1.8 {q4}, [r0]! + veor q2, q2, q12 + vst1.8 {q2}, [r0]! + veor q7, q7, q13 + vst1.8 {q7}, [r0]! + veor q3, q3, q14 + vst1.8 {q3}, [r0]! + veor q5, q5, q15 + vld1.8 {q8}, [r1]! // load next round's iv +2: vst1.8 {q5}, [r0]! 
+ + subs r5, r5, #8 + vst1.8 {q8}, [r6] // store next round's iv + bgt 99b + + pop {r4-r6, pc} +ENDPROC(aesbs_cbc_decrypt) + + .macro next_ctr, q + vmov.32 \q\()h[1], r10 + adds r10, r10, #1 + vmov.32 \q\()h[0], r9 + adcs r9, r9, #0 + vmov.32 \q\()l[1], r8 + adcs r8, r8, #0 + vmov.32 \q\()l[0], r7 + adc r7, r7, #0 + vrev32.8 \q, \q + .endm + + /* + * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + * int rounds, int blocks, u8 ctr[], u8 final[]) + */ +ENTRY(aesbs_ctr_encrypt) + mov ip, sp + push {r4-r10, lr} + + ldm ip, {r5-r7} // load args 4-6 + teq r7, #0 + addne r5, r5, #1 // one extra block if final != 0 + + vld1.8 {q0}, [r6] // load counter + vrev32.8 q1, q0 + vmov r9, r10, d3 + vmov r7, r8, d2 + + adds r10, r10, #1 + adcs r9, r9, #0 + adcs r8, r8, #0 + adc r7, r7, #0 + +99: vmov q1, q0 + vmov q2, q0 + vmov q3, q0 + vmov q4, q0 + vmov q5, q0 + vmov q6, q0 + vmov q7, q0 + + __adr ip, 0f + sub lr, r5, #1 + and lr, lr, #7 + cmp r5, #8 + sub ip, ip, lr, lsl #5 + sub ip, ip, lr, lsl #2 + bxlt ip // computed goto if blocks < 8 + + next_ctr q1 + next_ctr q2 + next_ctr q3 + next_ctr q4 + next_ctr q5 + next_ctr q6 + next_ctr q7 + +0: mov bskey, r2 + mov rounds, r3 + bl aesbs_encrypt8 + + __adr ip, 1f + and lr, r5, #7 + cmp r5, #8 + movgt r4, #0 + ldrle r4, [sp, #40] // load final in the last round + sub ip, ip, lr, lsl #2 + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q8}, [r1]! + vld1.8 {q9}, [r1]! + vld1.8 {q10}, [r1]! + vld1.8 {q11}, [r1]! + vld1.8 {q12}, [r1]! + vld1.8 {q13}, [r1]! + vld1.8 {q14}, [r1]! + teq r4, #0 // skip last block if 'final' +1: bne 2f + vld1.8 {q15}, [r1]! + +2: __adr ip, 3f + cmp r5, #8 + sub ip, ip, lr, lsl #3 + bxlt ip // computed goto if blocks < 8 + + veor q0, q0, q8 + vst1.8 {q0}, [r0]! + veor q1, q1, q9 + vst1.8 {q1}, [r0]! + veor q4, q4, q10 + vst1.8 {q4}, [r0]! + veor q6, q6, q11 + vst1.8 {q6}, [r0]! + veor q3, q3, q12 + vst1.8 {q3}, [r0]! + veor q7, q7, q13 + vst1.8 {q7}, [r0]! + veor q2, q2, q14 + vst1.8 {q2}, [r0]! 
+ teq r4, #0 // skip last block if 'final' + W(bne) 5f +3: veor q5, q5, q15 + vst1.8 {q5}, [r0]! + +4: next_ctr q0 + + subs r5, r5, #8 + bgt 99b + + vst1.8 {q0}, [r6] + pop {r4-r10, pc} + +5: vst1.8 {q5}, [r4] + b 4b +ENDPROC(aesbs_ctr_encrypt) + + .macro next_tweak, out, in, const, tmp + vshr.s64 \tmp, \in, #63 + vand \tmp, \tmp, \const + vadd.u64 \out, \in, \in + vext.8 \tmp, \tmp, \tmp, #8 + veor \out, \out, \tmp + .endm + + .align 4 +.Lxts_mul_x: + .quad 1, 0x87 + + /* + * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ +__xts_prepare8: + vld1.8 {q14}, [r7] // load iv + __ldr q15, .Lxts_mul_x // load tweak mask + vmov q12, q14 + + __adr ip, 0f + and r4, r6, #7 + cmp r6, #8 + sub ip, ip, r4, lsl #5 + mov r4, sp + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q0}, [r1]! + next_tweak q12, q14, q15, q13 + veor q0, q0, q14 + vst1.8 {q14}, [r4, :128]! + + vld1.8 {q1}, [r1]! + next_tweak q14, q12, q15, q13 + veor q1, q1, q12 + vst1.8 {q12}, [r4, :128]! + + vld1.8 {q2}, [r1]! + next_tweak q12, q14, q15, q13 + veor q2, q2, q14 + vst1.8 {q14}, [r4, :128]! + + vld1.8 {q3}, [r1]! + next_tweak q14, q12, q15, q13 + veor q3, q3, q12 + vst1.8 {q12}, [r4, :128]! + + vld1.8 {q4}, [r1]! + next_tweak q12, q14, q15, q13 + veor q4, q4, q14 + vst1.8 {q14}, [r4, :128]! + + vld1.8 {q5}, [r1]! + next_tweak q14, q12, q15, q13 + veor q5, q5, q12 + vst1.8 {q12}, [r4, :128]! + + vld1.8 {q6}, [r1]! + next_tweak q12, q14, q15, q13 + veor q6, q6, q14 + vst1.8 {q14}, [r4, :128]! + + vld1.8 {q7}, [r1]! 
+ next_tweak q14, q12, q15, q13 + veor q7, q7, q12 + vst1.8 {q12}, [r4, :128] + +0: vst1.8 {q14}, [r7] // store next iv + bx lr +ENDPROC(__xts_prepare8) + + .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + push {r4-r8, lr} + mov r5, sp // preserve sp + ldrd r6, r7, [sp, #24] // get blocks and iv args + sub ip, sp, #128 // make room for 8x tweak + bic ip, ip, #0xf // align sp to 16 bytes + mov sp, ip + +99: bl __xts_prepare8 + + mov bskey, r2 + mov rounds, r3 + bl \do8 + + __adr ip, 0f + and lr, r6, #7 + cmp r6, #8 + sub ip, ip, lr, lsl #2 + mov r4, sp + bxlt ip // computed goto if blocks < 8 + + vld1.8 {q8}, [r4, :128]! + vld1.8 {q9}, [r4, :128]! + vld1.8 {q10}, [r4, :128]! + vld1.8 {q11}, [r4, :128]! + vld1.8 {q12}, [r4, :128]! + vld1.8 {q13}, [r4, :128]! + vld1.8 {q14}, [r4, :128]! + vld1.8 {q15}, [r4, :128] + +0: __adr ip, 1f + sub ip, ip, lr, lsl #3 + bxlt ip // computed goto if blocks < 8 + + veor \o0, \o0, q8 + vst1.8 {\o0}, [r0]! + veor \o1, \o1, q9 + vst1.8 {\o1}, [r0]! + veor \o2, \o2, q10 + vst1.8 {\o2}, [r0]! + veor \o3, \o3, q11 + vst1.8 {\o3}, [r0]! + veor \o4, \o4, q12 + vst1.8 {\o4}, [r0]! + veor \o5, \o5, q13 + vst1.8 {\o5}, [r0]! + veor \o6, \o6, q14 + vst1.8 {\o6}, [r0]! + veor \o7, \o7, q15 + vst1.8 {\o7}, [r0]! 
+ +1: subs r6, r6, #8 + bgt 99b + + mov sp, r5 + pop {r4-r8, pc} + .endm + +ENTRY(aesbs_xts_encrypt) + __xts_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5 +ENDPROC(aesbs_xts_encrypt) + +ENTRY(aesbs_xts_decrypt) + __xts_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5 +ENDPROC(aesbs_xts_decrypt) diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c index 648d5fac9cbf..8234d563f050 100644 --- a/arch/arm/crypto/aesbs-glue.c +++ b/arch/arm/crypto/aesbs-glue.c @@ -24,11 +24,13 @@ struct BS_KEY { u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE]; } __aligned(8); +asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds); +asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in); asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in); -asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes, - struct BS_KEY *key, u8 iv[]); asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks, struct BS_KEY *key, u8 const iv[]); @@ -39,9 +41,14 @@ asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes, asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes, struct BS_KEY *key, u8 tweak[]); +struct aesbs_ctx { + int rounds; + u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE); +}; + struct aesbs_cbc_ctx { - struct AES_KEY enc; - struct BS_KEY dec; + struct aesbs_ctx key; + struct crypto_cipher *enc_tfm; }; struct aesbs_ctr_ctx { @@ -58,16 +65,20 @@ static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); - int bits = key_len * 8; + struct crypto_aes_ctx rk; + int err; - if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - ctx->dec.rk = ctx->enc; - 
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk); - ctx->dec.converted = 0; - return 0; + err = crypto_aes_expand_key(&rk, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); + kernel_neon_end(); + + return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len); } static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -101,11 +112,62 @@ static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, return 0; } -static int aesbs_cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static inline int crypto_cbc_encrypt_segment(struct blkcipher_walk *walk, + struct crypto_blkcipher *tfm, + void (*fn)(struct crypto_blkcipher *, const u8 *, u8 *)) +{ + unsigned int bsize = AES_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + u8 *iv = walk->iv; + + do { + crypto_xor(iv, src, bsize); + fn(tfm, iv, dst); + memcpy(iv, dst, bsize); + + src += bsize; + dst += bsize; + } while ((nbytes -= bsize) >= bsize); + + return nbytes; +} + +static inline int crypto_cbc_encrypt_inplace(struct blkcipher_walk *walk, + struct crypto_blkcipher *tfm, + void (*fn)(struct crypto_blkcipher *, const u8 *, u8 *)) +{ + unsigned int bsize = AES_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u8 *src = walk->src.virt.addr; + u8 *iv = walk->iv; + + do { + crypto_xor(src, iv, bsize); + fn(tfm, src, src); + iv = src; + + src += bsize; + } while ((nbytes -= bsize) >= bsize); + + memcpy(walk->iv, iv, bsize); + + return nbytes; +} + +static void cbc_encrypt_one(struct crypto_blkcipher *tfm, const u8 *src, + u8 *dst) +{ + struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(tfm); + + crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct 
scatterlist *src, unsigned int nbytes) { - struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; int err; @@ -113,38 +175,21 @@ static int aesbs_cbc_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); while (walk.nbytes) { - u32 blocks = walk.nbytes / AES_BLOCK_SIZE; - u8 *src = walk.src.virt.addr; - - if (walk.dst.virt.addr == walk.src.virt.addr) { - u8 *iv = walk.iv; - - do { - crypto_xor(src, iv, AES_BLOCK_SIZE); - AES_encrypt(src, src, &ctx->enc); - iv = src; - src += AES_BLOCK_SIZE; - } while (--blocks); - memcpy(walk.iv, iv, AES_BLOCK_SIZE); - } else { - u8 *dst = walk.dst.virt.addr; - - do { - crypto_xor(walk.iv, src, AES_BLOCK_SIZE); - AES_encrypt(walk.iv, dst, &ctx->enc); - memcpy(walk.iv, dst, AES_BLOCK_SIZE); - src += AES_BLOCK_SIZE; - dst += AES_BLOCK_SIZE; - } while (--blocks); - } + if (walk.src.virt.addr == walk.dst.virt.addr) + err = crypto_cbc_encrypt_inplace(&walk, desc->tfm, + cbc_encrypt_one); + else + err = crypto_cbc_encrypt_segment(&walk, desc->tfm, + cbc_encrypt_one); err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); } + return err; } -static int aesbs_cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; @@ -153,40 +198,35 @@ static int aesbs_cbc_decrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); - while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) { - kernel_neon_begin(); - bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr, - walk.nbytes, &ctx->dec, walk.iv); - kernel_neon_end(); + kernel_neon_begin(); + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / 
AES_BLOCK_SIZE; + + aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); } - while (walk.nbytes) { - u32 blocks = walk.nbytes / AES_BLOCK_SIZE; - u8 *dst = walk.dst.virt.addr; - u8 *src = walk.src.virt.addr; - u8 bk[2][AES_BLOCK_SIZE]; - u8 *iv = walk.iv; + kernel_neon_end(); - do { - if (walk.dst.virt.addr == walk.src.virt.addr) - memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE); - - AES_decrypt(src, dst, &ctx->dec.rk); - crypto_xor(dst, iv, AES_BLOCK_SIZE); - - if (walk.dst.virt.addr == walk.src.virt.addr) - iv = bk[blocks & 1]; - else - iv = src; - - dst += AES_BLOCK_SIZE; - src += AES_BLOCK_SIZE; - } while (--blocks); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); - } return err; } +static int cbc_init(struct crypto_tfm *tfm) +{ + struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0); + return PTR_RET(ctx->enc_tfm); +} + +static void cbc_exit(struct crypto_tfm *tfm) +{ + struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_cipher(ctx->enc_tfm); +} + static void inc_be128_ctr(__be32 ctr[], u32 addend) { int i; @@ -308,13 +348,15 @@ static struct crypto_alg aesbs_algs[] = { { .cra_alignmask = 7, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, + .cra_init = cbc_init, + .cra_exit = cbc_exit, .cra_blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = aesbs_cbc_set_key, - .encrypt = aesbs_cbc_encrypt, - .decrypt = aesbs_cbc_decrypt, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, }, }, { .cra_name = "__ctr-aes-neonbs",