mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-07 03:15:31 +09:00
Merge tag 'ASB-2020-11-05_4.19-stable' of https://android.googlesource.com/kernel/common
https://source.android.com/security/bulletin/2020-11-01 CVE-2020-0423 * tag 'ASB-2020-11-05_4.19-stable': (529 commits) ANDROID: GKI: Enable DEBUG_INFO_DWARF4 UPSTREAM: mm/sl[uo]b: export __kmalloc_track(_node)_caller BACKPORT: xfrm/compat: Translate 32-bit user_policy from sockptr BACKPORT: xfrm/compat: Add 32=>64-bit messages translator UPSTREAM: xfrm/compat: Attach xfrm dumps to 64=>32 bit translator UPSTREAM: xfrm/compat: Add 64=>32-bit messages translator BACKPORT: xfrm: Provide API to register translator module ANDROID: Publish uncompressed Image on aarch64 FROMLIST: crypto: arm64/poly1305-neon - reorder PAC authentication with SP update UPSTREAM: crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian UPSTREAM: crypto: arm64/chacha - fix hchacha_block_neon() for big endian Linux 4.19.154 usb: gadget: f_ncm: allow using NCM in SuperSpeed Plus gadgets. eeprom: at25: set minimum read/write access stride to 1 USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync(). usb: cdc-acm: add quirk to blacklist ETAS ES58X devices tty: serial: fsl_lpuart: fix lpuart32_poll_get_char net: korina: cast KSEG0 address to pointer in kfree ath10k: check idx validity in __ath10k_htt_rx_ring_fill_n() scsi: ufs: ufs-qcom: Fix race conditions caused by ufs_qcom_testbus_config() ... Change-Id: I797efa1149f557c1dfab7856813cc40d1a4d60b2 Conflicts: drivers/net/ethernet/stmicro/stmmac/stmmac_main.c mm/page_alloc.c
This commit is contained in:
@@ -562,7 +562,7 @@
|
||||
loops can be debugged more effectively on production
|
||||
systems.
|
||||
|
||||
clearcpuid=BITNUM [X86]
|
||||
clearcpuid=BITNUM[,BITNUM...] [X86]
|
||||
Disable CPUID feature X for the kernel. See
|
||||
arch/x86/include/asm/cpufeatures.h for the valid bit
|
||||
numbers. Note the Linux specific bits are not necessarily
|
||||
|
||||
@@ -99,16 +99,20 @@ Coarse and fast_ns access
|
||||
|
||||
Some additional variants exist for more specialized cases:
|
||||
|
||||
.. c:function:: ktime_t ktime_get_coarse_boottime( void )
|
||||
.. c:function:: ktime_t ktime_get_coarse( void )
|
||||
ktime_t ktime_get_coarse_boottime( void )
|
||||
ktime_t ktime_get_coarse_real( void )
|
||||
ktime_t ktime_get_coarse_clocktai( void )
|
||||
ktime_t ktime_get_coarse_raw( void )
|
||||
|
||||
.. c:function:: u64 ktime_get_coarse_ns( void )
|
||||
u64 ktime_get_coarse_boottime_ns( void )
|
||||
u64 ktime_get_coarse_real_ns( void )
|
||||
u64 ktime_get_coarse_clocktai_ns( void )
|
||||
|
||||
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
|
||||
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
|
||||
void ktime_get_coarse_real_ts64( struct timespec64 * )
|
||||
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
|
||||
void ktime_get_coarse_raw_ts64( struct timespec64 * )
|
||||
|
||||
These are quicker than the non-coarse versions, but less accurate,
|
||||
corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
|
||||
|
||||
@@ -934,12 +934,14 @@ icmp_ratelimit - INTEGER
|
||||
icmp_msgs_per_sec - INTEGER
|
||||
Limit maximal number of ICMP packets sent per second from this host.
|
||||
Only messages whose type matches icmp_ratemask (see below) are
|
||||
controlled by this limit.
|
||||
controlled by this limit. For security reasons, the precise count
|
||||
of messages per second is randomized.
|
||||
Default: 1000
|
||||
|
||||
icmp_msgs_burst - INTEGER
|
||||
icmp_msgs_per_sec controls number of ICMP packets sent per second,
|
||||
while icmp_msgs_burst controls the burst size of these packets.
|
||||
For security reasons, the precise burst size is randomized.
|
||||
Default: 50
|
||||
|
||||
icmp_ratemask - INTEGER
|
||||
|
||||
@@ -3901,6 +3901,7 @@ F: crypto/
|
||||
F: drivers/crypto/
|
||||
F: include/crypto/
|
||||
F: include/linux/crypto*
|
||||
F: lib/crypto/
|
||||
|
||||
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
|
||||
M: Neil Horman <nhorman@tuxdriver.com>
|
||||
@@ -15882,6 +15883,14 @@ L: linux-gpio@vger.kernel.org
|
||||
S: Maintained
|
||||
F: drivers/gpio/gpio-ws16c48.c
|
||||
|
||||
WIREGUARD SECURE NETWORK TUNNEL
|
||||
M: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||
S: Maintained
|
||||
F: drivers/net/wireguard/
|
||||
F: tools/testing/selftests/wireguard/
|
||||
L: wireguard@lists.zx2c4.com
|
||||
L: netdev@vger.kernel.org
|
||||
|
||||
WISTRON LAPTOP BUTTON DRIVER
|
||||
M: Miloslav Trmac <mitr@volny.cz>
|
||||
S: Maintained
|
||||
|
||||
2
Makefile
2
Makefile
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
VERSION = 4
|
||||
PATCHLEVEL = 19
|
||||
SUBLEVEL = 149
|
||||
SUBLEVEL = 154
|
||||
EXTRAVERSION =
|
||||
NAME = "People's Front"
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2348,6 +2348,7 @@
|
||||
__sock_recv_ts_and_drops
|
||||
sock_wake_async
|
||||
sock_wfree
|
||||
timer_reduce
|
||||
unregister_net_sysctl_table
|
||||
__wake_up_sync_key
|
||||
__xfrm_policy_check
|
||||
|
||||
@@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
|
||||
select ARC_HAS_ACCL_REGS
|
||||
select ARC_IRQ_NO_AUTOSAVE
|
||||
select CLK_HSDK
|
||||
select RESET_CONTROLLER
|
||||
select RESET_HSDK
|
||||
select MIGHT_HAVE_PCI
|
||||
|
||||
@@ -120,7 +120,7 @@ ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin -I$(obj)
|
||||
asflags-y := -DZIMAGE
|
||||
|
||||
# Supply kernel BSS size to the decompressor via a linker symbol.
|
||||
KBSS_SZ = $(shell echo $$(($$($(CROSS_COMPILE)nm $(obj)/../../../../vmlinux | \
|
||||
KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \
|
||||
sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \
|
||||
-e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) )
|
||||
LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)
|
||||
@@ -166,7 +166,7 @@ $(obj)/bswapsdi2.S: $(srctree)/arch/$(SRCARCH)/lib/bswapsdi2.S
|
||||
# The .data section is already discarded by the linker script so no need
|
||||
# to bother about it here.
|
||||
check_for_bad_syms = \
|
||||
bad_syms=$$($(CROSS_COMPILE)nm $@ | sed -n 's/^.\{8\} [bc] \(.*\)/\1/p') && \
|
||||
bad_syms=$$($(NM) $@ | sed -n 's/^.\{8\} [bc] \(.*\)/\1/p') && \
|
||||
[ -z "$$bad_syms" ] || \
|
||||
( echo "following symbols must have non local/private scope:" >&2; \
|
||||
echo "$$bad_syms" >&2; rm -f $@; false )
|
||||
|
||||
@@ -922,8 +922,10 @@
|
||||
};
|
||||
|
||||
rngb: rngb@21b4000 {
|
||||
compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
|
||||
reg = <0x021b4000 0x4000>;
|
||||
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&clks IMX6SL_CLK_DUMMY>;
|
||||
};
|
||||
|
||||
weim: weim@21b8000 {
|
||||
|
||||
@@ -85,21 +85,21 @@
|
||||
global_timer: timer@b0020200 {
|
||||
compatible = "arm,cortex-a9-global-timer";
|
||||
reg = <0xb0020200 0x100>;
|
||||
interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
twd_timer: timer@b0020600 {
|
||||
compatible = "arm,cortex-a9-twd-timer";
|
||||
reg = <0xb0020600 0x20>;
|
||||
interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
twd_wdt: wdt@b0020620 {
|
||||
compatible = "arm,cortex-a9-twd-wdt";
|
||||
reg = <0xb0020620 0xe0>;
|
||||
interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
|
||||
@@ -206,16 +206,16 @@
|
||||
};
|
||||
|
||||
&reg_dc1sw {
|
||||
regulator-min-microvolt = <3000000>;
|
||||
regulator-max-microvolt = <3000000>;
|
||||
regulator-min-microvolt = <3300000>;
|
||||
regulator-max-microvolt = <3300000>;
|
||||
regulator-name = "vcc-gmac-phy";
|
||||
};
|
||||
|
||||
&reg_dcdc1 {
|
||||
regulator-always-on;
|
||||
regulator-min-microvolt = <3000000>;
|
||||
regulator-max-microvolt = <3000000>;
|
||||
regulator-name = "vcc-3v0";
|
||||
regulator-min-microvolt = <3300000>;
|
||||
regulator-max-microvolt = <3300000>;
|
||||
regulator-name = "vcc-3v3";
|
||||
};
|
||||
|
||||
&reg_dcdc2 {
|
||||
|
||||
1
arch/arm/crypto/.gitignore
vendored
1
arch/arm/crypto/.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
aesbs-core.S
|
||||
sha256-core.S
|
||||
sha512-core.S
|
||||
poly1305-core.S
|
||||
|
||||
@@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
|
||||
select CRYPTO_HASH
|
||||
|
||||
config CRYPTO_CHACHA20_NEON
|
||||
tristate "NEON accelerated ChaCha stream cipher algorithms"
|
||||
depends on KERNEL_MODE_NEON
|
||||
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
|
||||
select CRYPTO_BLKCIPHER
|
||||
select CRYPTO_CHACHA20
|
||||
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||
|
||||
config CRYPTO_POLY1305_ARM
|
||||
tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||
|
||||
config CRYPTO_NHPOLY1305_NEON
|
||||
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
|
||||
depends on KERNEL_MODE_NEON
|
||||
select CRYPTO_NHPOLY1305
|
||||
|
||||
config CRYPTO_CURVE25519_NEON
|
||||
tristate "NEON accelerated Curve25519 scalar multiplication library"
|
||||
depends on KERNEL_MODE_NEON
|
||||
select CRYPTO_LIB_CURVE25519_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
|
||||
|
||||
endif
|
||||
|
||||
@@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
|
||||
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
|
||||
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
|
||||
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
|
||||
|
||||
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
||||
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
||||
@@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
|
||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
||||
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
||||
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
||||
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
||||
chacha-neon-y := chacha-scalar-core.o chacha-glue.o
|
||||
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
|
||||
poly1305-arm-y := poly1305-core.o poly1305-glue.o
|
||||
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
|
||||
curve25519-neon-y := curve25519-core.o curve25519-glue.o
|
||||
|
||||
ifdef REGENERATE_ARM_CRYPTO
|
||||
quiet_cmd_perl = PERL $@
|
||||
cmd_perl = $(PERL) $(<) > $(@)
|
||||
|
||||
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
|
||||
$(call cmd,perl)
|
||||
|
||||
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
||||
$(call cmd,perl)
|
||||
|
||||
@@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
|
||||
$(call cmd,perl)
|
||||
endif
|
||||
|
||||
targets += sha256-core.S sha512-core.S
|
||||
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||
|
||||
# massage the perlasm code a bit so we only get the NEON routine if we need it
|
||||
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
|
||||
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
|
||||
AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
|
||||
|
||||
356
arch/arm/crypto/chacha-glue.c
Normal file
356
arch/arm/crypto/chacha-glue.c
Normal file
@@ -0,0 +1,356 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
||||
* including ChaCha20 (RFC7539)
|
||||
*
|
||||
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/cputype.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds);
|
||||
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds);
|
||||
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
|
||||
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||
|
||||
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||
const u32 *state, int nrounds);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
|
||||
|
||||
static inline bool neon_usable(void)
|
||||
{
|
||||
return static_branch_likely(&use_neon) && may_use_simd();
|
||||
}
|
||||
|
||||
/*
 * XOR @bytes bytes from @src into @dst using the NEON block routines.
 *
 * Data is consumed four blocks at a time while possible, then one block at
 * a time.  A trailing partial block is bounced through an on-stack buffer so
 * the single-block routine always operates on a full CHACHA_BLOCK_SIZE
 * region.  state[12] is advanced once per full block processed; it is left
 * untouched after a trailing partial block.
 *
 * The caller is responsible for bracketing this with kernel_neon_begin() /
 * kernel_neon_end().
 */
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	const unsigned int quad = CHACHA_BLOCK_SIZE * 4;
	u8 tail[CHACHA_BLOCK_SIZE];

	/* Bulk: four blocks per call. */
	for (; bytes >= quad; bytes -= quad) {
		chacha_4block_xor_neon(state, dst, src, nrounds);
		src += quad;
		dst += quad;
		state[12] += 4;
	}

	/* Remaining whole blocks, one per call. */
	for (; bytes >= CHACHA_BLOCK_SIZE; bytes -= CHACHA_BLOCK_SIZE) {
		chacha_block_xor_neon(state, dst, src, nrounds);
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}

	if (!bytes)
		return;

	/* Partial final block: process in place inside the bounce buffer. */
	memcpy(tail, src, bytes);
	chacha_block_xor_neon(state, tail, tail, nrounds);
	memcpy(dst, tail, bytes);
}
|
||||
|
||||
/*
 * Exported HChaCha entry point for ARM.
 *
 * Uses hchacha_block_neon() inside a kernel_neon_begin()/kernel_neon_end()
 * section when the kernel is built with CONFIG_KERNEL_MODE_NEON and
 * neon_usable() reports true; otherwise falls back to hchacha_block_arm().
 */
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon_usable()) {
		kernel_neon_begin();
		hchacha_block_neon(state, stream, nrounds);
		kernel_neon_end();
		return;
	}

	hchacha_block_arm(state, stream, nrounds);
}
EXPORT_SYMBOL(hchacha_block_arch);
|
||||
|
||||
/*
 * Exported arch-level ChaCha state initialisation.  There is no ARM specific
 * work to do here: the call is forwarded unchanged to chacha_init_generic().
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||
{
|
||||
chacha_init_generic(state, key, iv);
|
||||
}
|
||||
EXPORT_SYMBOL(chacha_init_arch);
|
||||
|
||||
/*
 * Exported ChaCha encrypt/decrypt entry point for ARM.
 *
 * The scalar routine chacha_doarm() handles the request when NEON support is
 * not built in, when neon_usable() says no, or when the input is at most one
 * block long.  In that case state[12] is advanced here by the number of
 * blocks covered (rounded up).
 *
 * Otherwise the data is processed with NEON in pieces of at most SZ_4K
 * bytes, each inside its own kernel_neon_begin()/kernel_neon_end() section.
 */
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	bool scalar = !IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
		      bytes <= CHACHA_BLOCK_SIZE;

	if (scalar) {
		chacha_doarm(dst, src, bytes, state, nrounds);
		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
		return;
	}

	/* bytes > CHACHA_BLOCK_SIZE here, so the loop runs at least once. */
	while (bytes) {
		unsigned int chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, chunk, nrounds);
		kernel_neon_end();

		src += chunk;
		dst += chunk;
		bytes -= chunk;
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||
|
||||
static int chacha_stream_xor(struct skcipher_request *req,
|
||||
const struct chacha_ctx *ctx, const u8 *iv,
|
||||
bool neon)
|
||||
{
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
chacha_init_generic(state, ctx->key, iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes, state, ctx->nrounds);
|
||||
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
|
||||
} else {
|
||||
kernel_neon_begin();
|
||||
chacha_doneon(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||
kernel_neon_end();
|
||||
}
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Common body of the plain ChaCha skcipher handlers: fetch the transform
 * context belonging to @req and run the stream cipher with the request's own
 * IV.  @neon is passed through to chacha_stream_xor().
 */
static int do_chacha(struct skcipher_request *req, bool neon)
{
	struct chacha_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));

	return chacha_stream_xor(req, ctx, req->iv, neon);
}
|
||||
|
||||
/*
 * skcipher encrypt/decrypt handler used by the "-arm" chacha algorithm:
 * always requests the non-NEON path from do_chacha().
 */
static int chacha_arm(struct skcipher_request *req)
|
||||
{
|
||||
return do_chacha(req, false);
|
||||
}
|
||||
|
||||
/*
 * skcipher encrypt/decrypt handler used by the "-neon" chacha algorithm:
 * asks do_chacha() for the NEON path only if neon_usable() is true at the
 * time of the call.
 */
static int chacha_neon(struct skcipher_request *req)
|
||||
{
|
||||
return do_chacha(req, neon_usable());
|
||||
}
|
||||
|
||||
/*
 * Common body of the XChaCha skcipher handlers.
 *
 * A state is initialised from the transform key and req->iv and fed through
 * HChaCha (NEON or scalar, depending on @neon and CONFIG_KERNEL_MODE_NEON)
 * to produce the key of a temporary sub-context.  The stream cipher is then
 * run with that sub-context and a 16-byte IV assembled from bytes 24..31 of
 * req->iv followed by bytes 16..23 of req->iv.
 */
static int do_xchacha(struct skcipher_request *req, bool neon)
{
	struct chacha_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	struct chacha_ctx subctx;
	u32 state[16];
	u8 real_iv[16];

	chacha_init_generic(state, ctx->key, req->iv);

	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon) {
		kernel_neon_begin();
		hchacha_block_neon(state, subctx.key, ctx->nrounds);
		kernel_neon_end();
	} else {
		hchacha_block_arm(state, subctx.key, ctx->nrounds);
	}
	subctx.nrounds = ctx->nrounds;

	/* Build the IV for the inner cipher from the tail of req->iv. */
	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);

	return chacha_stream_xor(req, &subctx, real_iv, neon);
}
|
||||
|
||||
/*
 * skcipher encrypt/decrypt handler used by the "-arm" xchacha algorithms:
 * always requests the non-NEON path from do_xchacha().
 */
static int xchacha_arm(struct skcipher_request *req)
|
||||
{
|
||||
return do_xchacha(req, false);
|
||||
}
|
||||
|
||||
/*
 * skcipher encrypt/decrypt handler used by the "-neon" xchacha algorithms:
 * asks do_xchacha() for the NEON path only if neon_usable() is true at the
 * time of the call.
 */
static int xchacha_neon(struct skcipher_request *req)
|
||||
{
|
||||
return do_xchacha(req, neon_usable());
|
||||
}
|
||||
|
||||
static struct skcipher_alg arm_algs[] = {
|
||||
{
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-arm",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = chacha_arm,
|
||||
.decrypt = chacha_arm,
|
||||
}, {
|
||||
.base.cra_name = "xchacha20",
|
||||
.base.cra_driver_name = "xchacha20-arm",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = xchacha_arm,
|
||||
.decrypt = xchacha_arm,
|
||||
}, {
|
||||
.base.cra_name = "xchacha12",
|
||||
.base.cra_driver_name = "xchacha12-arm",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = xchacha_arm,
|
||||
.decrypt = xchacha_arm,
|
||||
},
|
||||
};
|
||||
|
||||
static struct skcipher_alg neon_algs[] = {
|
||||
{
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-neon",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = chacha_neon,
|
||||
.decrypt = chacha_neon,
|
||||
}, {
|
||||
.base.cra_name = "xchacha20",
|
||||
.base.cra_driver_name = "xchacha20-neon",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = xchacha_neon,
|
||||
.decrypt = xchacha_neon,
|
||||
}, {
|
||||
.base.cra_name = "xchacha12",
|
||||
.base.cra_driver_name = "xchacha12-neon",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = xchacha_neon,
|
||||
.decrypt = xchacha_neon,
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Module init.
 *
 * Registers the scalar algorithms when the skcipher API is reachable.  If
 * the kernel supports kernel-mode NEON and the CPU advertises NEON, it then
 * either demotes the NEON algorithms to priority 0 (Cortex-A7 / Cortex-A5)
 * or enables the use_neon static key (every other part), and registers the
 * NEON algorithms.  A failure to register the NEON algorithms unregisters
 * the scalar ones again.
 *
 * Returns 0 on success or the error from crypto_register_skciphers().
 */
static int __init chacha_simd_mod_init(void)
{
	int err;
	int i;

	if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
		err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
		if (err)
			return err;
	}

	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !(elf_hwcap & HWCAP_NEON))
		return 0;

	switch (read_cpuid_part()) {
	case ARM_CPU_PART_CORTEX_A7:
	case ARM_CPU_PART_CORTEX_A5:
		/*
		 * The Cortex-A7 and Cortex-A5 do not perform well with the
		 * NEON implementation but do incredibly well with the scalar
		 * one and use less power.
		 */
		for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
			neon_algs[i].base.cra_priority = 0;
		break;
	default:
		static_branch_enable(&use_neon);
		break;
	}

	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return 0;

	err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
	if (err)
		crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));

	return err;
}
|
||||
|
||||
/*
 * Module exit: undo the registrations done at init time.  Nothing was
 * registered when the skcipher API is not reachable; the NEON algorithms
 * were registered only with kernel-mode NEON support on a NEON capable CPU.
 */
static void __exit chacha_simd_mod_fini(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return;

	crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));

	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !(elf_hwcap & HWCAP_NEON))
		return;

	crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
}
|
||||
|
||||
module_init(chacha_simd_mod_init);
|
||||
module_exit(chacha_simd_mod_fini);
|
||||
|
||||
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-arm");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20-arm");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12-arm");
|
||||
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||
MODULE_ALIAS_CRYPTO("chacha20-neon");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20-neon");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12-neon");
|
||||
#endif
|
||||
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
@@ -0,0 +1,460 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (C) 2018 Google, Inc.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
/*
|
||||
* Design notes:
|
||||
*
|
||||
* 16 registers would be needed to hold the state matrix, but only 14 are
|
||||
* available because 'sp' and 'pc' cannot be used. So we spill the elements
|
||||
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
|
||||
* 'ldrd' and one 'strd' instruction per round.
|
||||
*
|
||||
* All rotates are performed using the implicit rotate operand accepted by the
|
||||
* 'add' and 'eor' instructions. This is faster than using explicit rotate
|
||||
* instructions. To make this work, we allow the values in the second and last
|
||||
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
|
||||
* wrong rotation amount. The rotation amount is then fixed up just in time
|
||||
* when the values are used. 'brot' is the number of bits the values in row 'b'
|
||||
* need to be rotated right to arrive at the correct values, and 'drot'
|
||||
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
|
||||
* that they end up as (25, 24) after every round.
|
||||
*/
|
||||
|
||||
// ChaCha state registers
|
||||
X0 .req r0
|
||||
X1 .req r1
|
||||
X2 .req r2
|
||||
X3 .req r3
|
||||
X4 .req r4
|
||||
X5 .req r5
|
||||
X6 .req r6
|
||||
X7 .req r7
|
||||
X8_X10 .req r8 // shared by x8 and x10
|
||||
X9_X11 .req r9 // shared by x9 and x11
|
||||
X12 .req r10
|
||||
X13 .req r11
|
||||
X14 .req r12
|
||||
X15 .req r14
|
||||
|
||||
// __rev: byte-reverse the 32-bit value in \in and place the result in \out.
// \t0-\t2 are scratch registers, used only on the pre-ARMv6 path.
// \out may be the same register as \in: on the fallback path \in is read for
// the last time by the instruction that first writes \out.
.macro __rev out, in, t0, t1, t2
|
||||
.if __LINUX_ARM_ARCH__ >= 6
|
||||
// ARMv6+: single instruction
rev \out, \in
|
||||
.else
|
||||
// t0 = byte 0 moved to the top
lsl \t0, \in, #24
|
||||
// t1 = byte 1 isolated
and \t1, \in, #0xff00
|
||||
// t2 = byte 2 isolated
and \t2, \in, #0xff0000
|
||||
// out = (byte 0 << 24) | byte 3 moved to the bottom
orr \out, \t0, \in, lsr #24
|
||||
// merge byte 1 shifted up by 8
orr \out, \out, \t1, lsl #8
|
||||
// merge byte 2 shifted down by 8
orr \out, \out, \t2, lsr #8
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// _le32_bswap: byte-swap register \x in place when building with __ARMEB__
// defined; expands to nothing otherwise.  \t0-\t2 are scratch registers
// handed to __rev.
.macro _le32_bswap x, t0, t1, t2
|
||||
#ifdef __ARMEB__
|
||||
__rev \x, \x, \t0, \t1, \t2
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// _le32_bswap_4x: apply _le32_bswap to the four registers \a, \b, \c and \d
// in turn, sharing the scratch registers \t0-\t2.
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
|
||||
_le32_bswap \a, \t0, \t1, \t2
|
||||
_le32_bswap \b, \t0, \t1, \t2
|
||||
_le32_bswap \c, \t0, \t1, \t2
|
||||
_le32_bswap \d, \t0, \t1, \t2
|
||||
.endm
|
||||
|
||||
// __ldrd: load two consecutive 32-bit words from [\src + \offset] into \a
// and \b.  Uses a single 'ldrd' when __LINUX_ARM_ARCH__ >= 6, otherwise two
// separate 'ldr' instructions.
.macro __ldrd a, b, src, offset
|
||||
#if __LINUX_ARM_ARCH__ >= 6
|
||||
ldrd \a, \b, [\src, #\offset]
|
||||
#else
|
||||
ldr \a, [\src, #\offset]
|
||||
ldr \b, [\src, #\offset + 4]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// __strd: store \a and \b as two consecutive 32-bit words at
// [\dst + \offset].  Uses a single 'strd' when __LINUX_ARM_ARCH__ >= 6,
// otherwise two separate 'str' instructions.
.macro __strd a, b, dst, offset
|
||||
#if __LINUX_ARM_ARCH__ >= 6
|
||||
strd \a, \b, [\dst, #\offset]
|
||||
#else
|
||||
str \a, [\dst, #\offset]
|
||||
str \b, [\dst, #\offset + 4]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
|
||||
|
||||
// a += b; d ^= a; d = rol(d, 16);
|
||||
add \a1, \a1, \b1, ror #brot
|
||||
add \a2, \a2, \b2, ror #brot
|
||||
eor \d1, \a1, \d1, ror #drot
|
||||
eor \d2, \a2, \d2, ror #drot
|
||||
// drot == 32 - 16 == 16
|
||||
|
||||
// c += d; b ^= c; b = rol(b, 12);
|
||||
add \c1, \c1, \d1, ror #16
|
||||
add \c2, \c2, \d2, ror #16
|
||||
eor \b1, \c1, \b1, ror #brot
|
||||
eor \b2, \c2, \b2, ror #brot
|
||||
// brot == 32 - 12 == 20
|
||||
|
||||
// a += b; d ^= a; d = rol(d, 8);
|
||||
add \a1, \a1, \b1, ror #20
|
||||
add \a2, \a2, \b2, ror #20
|
||||
eor \d1, \a1, \d1, ror #16
|
||||
eor \d2, \a2, \d2, ror #16
|
||||
// drot == 32 - 8 == 24
|
||||
|
||||
// c += d; b ^= c; b = rol(b, 7);
|
||||
add \c1, \c1, \d1, ror #24
|
||||
add \c2, \c2, \d2, ror #24
|
||||
eor \b1, \c1, \b1, ror #20
|
||||
eor \b2, \c2, \b2, ror #20
|
||||
// brot == 32 - 7 == 25
|
||||
.endm
|
||||
|
||||
.macro _doubleround
|
||||
|
||||
// column round
|
||||
|
||||
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
|
||||
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
|
||||
|
||||
// save (x8, x9); restore (x10, x11)
|
||||
__strd X8_X10, X9_X11, sp, 0
|
||||
__ldrd X8_X10, X9_X11, sp, 8
|
||||
|
||||
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
|
||||
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
|
||||
|
||||
.set brot, 25
|
||||
.set drot, 24
|
||||
|
||||
// diagonal round
|
||||
|
||||
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
|
||||
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
|
||||
|
||||
// save (x10, x11); restore (x8, x9)
|
||||
__strd X8_X10, X9_X11, sp, 8
|
||||
__ldrd X8_X10, X9_X11, sp, 0
|
||||
|
||||
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
|
||||
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
|
||||
.endm
|
||||
|
||||
// _chacha_permute: emit the full permutation as \nrounds / 2 unrolled copies
// of _doubleround.  The assembly-time symbols 'brot' and 'drot' (pending
// right-rotation of rows 'b' and 'd') are reset to 0 first; _doubleround
// sets them to (25, 24) partway through its first expansion.
.macro _chacha_permute nrounds
|
||||
.set brot, 0
|
||||
.set drot, 0
|
||||
.rept \nrounds / 2
|
||||
_doubleround
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro _chacha nrounds
|
||||
|
||||
.Lnext_block\@:
|
||||
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
|
||||
// Registers contain x0-x9,x12-x15.
|
||||
|
||||
// Do the core ChaCha permutation to update x0-x15.
|
||||
_chacha_permute \nrounds
|
||||
|
||||
add sp, #8
|
||||
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers contain x0-x9,x12-x15.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
|
||||
push {X8_X10, X9_X11, X12, X13, X14, X15}
|
||||
|
||||
// Load (OUT, IN, LEN).
|
||||
ldr r14, [sp, #96]
|
||||
ldr r12, [sp, #100]
|
||||
ldr r11, [sp, #104]
|
||||
|
||||
orr r10, r14, r12
|
||||
|
||||
// Use slow path if fewer than 64 bytes remain.
|
||||
cmp r11, #64
|
||||
blt .Lxor_slowpath\@
|
||||
|
||||
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
|
||||
// ARMv6+, since ldmia and stmia (used below) still require alignment.
|
||||
tst r10, #3
|
||||
bne .Lxor_slowpath\@
|
||||
|
||||
// Fast path: XOR 64 bytes of aligned data.
|
||||
|
||||
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// x0-x3
|
||||
__ldrd r8, r9, sp, 32
|
||||
__ldrd r10, r11, sp, 40
|
||||
add X0, X0, r8
|
||||
add X1, X1, r9
|
||||
add X2, X2, r10
|
||||
add X3, X3, r11
|
||||
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||
ldmia r12!, {r8-r11}
|
||||
eor X0, X0, r8
|
||||
eor X1, X1, r9
|
||||
eor X2, X2, r10
|
||||
eor X3, X3, r11
|
||||
stmia r14!, {X0-X3}
|
||||
|
||||
// x4-x7
|
||||
__ldrd r8, r9, sp, 48
|
||||
__ldrd r10, r11, sp, 56
|
||||
add X4, r8, X4, ror #brot
|
||||
add X5, r9, X5, ror #brot
|
||||
ldmia r12!, {X0-X3}
|
||||
add X6, r10, X6, ror #brot
|
||||
add X7, r11, X7, ror #brot
|
||||
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||
eor X4, X4, X0
|
||||
eor X5, X5, X1
|
||||
eor X6, X6, X2
|
||||
eor X7, X7, X3
|
||||
stmia r14!, {X4-X7}
|
||||
|
||||
// x8-x15
|
||||
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||
__ldrd r8, r9, sp, 32
|
||||
__ldrd r10, r11, sp, 40
|
||||
add r0, r0, r8 // x8
|
||||
add r1, r1, r9 // x9
|
||||
add r6, r6, r10 // x10
|
||||
add r7, r7, r11 // x11
|
||||
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||
ldmia r12!, {r8-r11}
|
||||
eor r0, r0, r8 // x8
|
||||
eor r1, r1, r9 // x9
|
||||
eor r6, r6, r10 // x10
|
||||
eor r7, r7, r11 // x11
|
||||
stmia r14!, {r0,r1,r6,r7}
|
||||
ldmia r12!, {r0,r1,r6,r7}
|
||||
__ldrd r8, r9, sp, 48
|
||||
__ldrd r10, r11, sp, 56
|
||||
add r2, r8, r2, ror #drot // x12
|
||||
add r3, r9, r3, ror #drot // x13
|
||||
add r4, r10, r4, ror #drot // x14
|
||||
add r5, r11, r5, ror #drot // x15
|
||||
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||
ldr r9, [sp, #72] // load LEN
|
||||
eor r2, r2, r0 // x12
|
||||
eor r3, r3, r1 // x13
|
||||
eor r4, r4, r6 // x14
|
||||
eor r5, r5, r7 // x15
|
||||
subs r9, #64 // decrement and check LEN
|
||||
stmia r14!, {r2-r5}
|
||||
|
||||
beq .Ldone\@
|
||||
|
||||
.Lprepare_for_next_block\@:
|
||||
|
||||
// Stack: x0-x15 OUT IN LEN
|
||||
|
||||
// Increment block counter (x12)
|
||||
add r8, #1
|
||||
|
||||
// Store updated (OUT, IN, LEN)
|
||||
str r14, [sp, #64]
|
||||
str r12, [sp, #68]
|
||||
str r9, [sp, #72]
|
||||
|
||||
mov r14, sp
|
||||
|
||||
// Store updated block counter (x12)
|
||||
str r8, [sp, #48]
|
||||
|
||||
sub sp, #16
|
||||
|
||||
// Reload state and do next block
|
||||
ldmia r14!, {r0-r11} // load x0-x11
|
||||
__strd r10, r11, sp, 8 // store x10-x11 before state
|
||||
ldmia r14, {r10-r12,r14} // load x12-x15
|
||||
b .Lnext_block\@
|
||||
|
||||
.Lxor_slowpath\@:
|
||||
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
|
||||
// We handle it by storing the 64 bytes of keystream to the stack, then
|
||||
// XOR-ing the needed portion with the data.
|
||||
|
||||
// Allocate keystream buffer
|
||||
sub sp, #64
|
||||
mov r14, sp
|
||||
|
||||
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// Save keystream for x0-x3
|
||||
__ldrd r8, r9, sp, 96
|
||||
__ldrd r10, r11, sp, 104
|
||||
add X0, X0, r8
|
||||
add X1, X1, r9
|
||||
add X2, X2, r10
|
||||
add X3, X3, r11
|
||||
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||
stmia r14!, {X0-X3}
|
||||
|
||||
// Save keystream for x4-x7
|
||||
__ldrd r8, r9, sp, 112
|
||||
__ldrd r10, r11, sp, 120
|
||||
add X4, r8, X4, ror #brot
|
||||
add X5, r9, X5, ror #brot
|
||||
add X6, r10, X6, ror #brot
|
||||
add X7, r11, X7, ror #brot
|
||||
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||
add r8, sp, #64
|
||||
stmia r14!, {X4-X7}
|
||||
|
||||
// Save keystream for x8-x15
|
||||
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||
__ldrd r8, r9, sp, 128
|
||||
__ldrd r10, r11, sp, 136
|
||||
add r0, r0, r8 // x8
|
||||
add r1, r1, r9 // x9
|
||||
add r6, r6, r10 // x10
|
||||
add r7, r7, r11 // x11
|
||||
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||
stmia r14!, {r0,r1,r6,r7}
|
||||
__ldrd r8, r9, sp, 144
|
||||
__ldrd r10, r11, sp, 152
|
||||
add r2, r8, r2, ror #drot // x12
|
||||
add r3, r9, r3, ror #drot // x13
|
||||
add r4, r10, r4, ror #drot // x14
|
||||
add r5, r11, r5, ror #drot // x15
|
||||
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||
stmia r14, {r2-r5}
|
||||
|
||||
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
|
||||
// Registers: r8 is block counter, r12 is IN.
|
||||
|
||||
ldr r9, [sp, #168] // LEN
|
||||
ldr r14, [sp, #160] // OUT
|
||||
cmp r9, #64
|
||||
mov r0, sp
|
||||
movle r1, r9
|
||||
movgt r1, #64
|
||||
// r1 is number of bytes to XOR, in range [1, 64]
|
||||
|
||||
.if __LINUX_ARM_ARCH__ < 6
|
||||
orr r2, r12, r14
|
||||
tst r2, #3 // IN or OUT misaligned?
|
||||
bne .Lxor_next_byte\@
|
||||
.endif
|
||||
|
||||
// XOR a word at a time
|
||||
.rept 16
|
||||
subs r1, #4
|
||||
blt .Lxor_words_done\@
|
||||
ldr r2, [r12], #4
|
||||
ldr r3, [r0], #4
|
||||
eor r2, r2, r3
|
||||
str r2, [r14], #4
|
||||
.endr
|
||||
b .Lxor_slowpath_done\@
|
||||
.Lxor_words_done\@:
|
||||
ands r1, r1, #3
|
||||
beq .Lxor_slowpath_done\@
|
||||
|
||||
// XOR a byte at a time
|
||||
.Lxor_next_byte\@:
|
||||
ldrb r2, [r12], #1
|
||||
ldrb r3, [r0], #1
|
||||
eor r2, r2, r3
|
||||
strb r2, [r14], #1
|
||||
subs r1, #1
|
||||
bne .Lxor_next_byte\@
|
||||
|
||||
.Lxor_slowpath_done\@:
|
||||
subs r9, #64
|
||||
add sp, #96
|
||||
bgt .Lprepare_for_next_block\@
|
||||
|
||||
.Ldone\@:
|
||||
.endm // _chacha
|
||||
|
||||
/*
|
||||
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||
* const u32 *state, int nrounds);
|
||||
*/
|
||||
ENTRY(chacha_doarm)
// Arguments (see the prototype comment above): r0 = dst, r1 = src,
// r2 = bytes, r3 = state; nrounds is the fifth argument and is read
// from the caller's stack below.
|
||||
cmp r2, #0 // len == 0?
|
||||
reteq lr
|
||||
|
||||
// Fetch nrounds from the stack and compare it against 12.  The flags
// from this compare are only consumed by the 'beq 1f' further down, so
// everything in between is expected to leave the condition flags alone
// (NOTE(review): __ldrd/__strd are macros defined elsewhere - confirm
// they do not set flags).
ldr ip, [sp]
|
||||
cmp ip, #12
|
||||
|
||||
// Save dst/src/bytes (r0-r2) together with the callee-saved registers
// and lr; r0-r2 end up lowest in memory, just above the state copy
// built below.
push {r0-r2,r4-r11,lr}
|
||||
|
||||
// Push state x0-x15 onto stack.
|
||||
// Also store an extra copy of x10-x11 just before the state.
|
||||
|
||||
// x12-x15 are read from state+48 and pushed first, so they land at the
// top of the on-stack state copy.
add X12, r3, #48
|
||||
ldm X12, {X12,X13,X14,X15}
|
||||
push {X12,X13,X14,X15}
|
||||
// Reserve 64 more bytes: x0-x11 go to sp+16..sp+63, the extra x10-x11
// copy goes to sp+8, and sp+0..sp+7 is not written here.
sub sp, sp, #64
|
||||
|
||||
// Load x10-x11 (state+40) and store them twice: once at sp+8 (the
// extra copy) and once at sp+56 (their slot inside the state copy).
__ldrd X8_X10, X9_X11, r3, 40
|
||||
__strd X8_X10, X9_X11, sp, 8
|
||||
__strd X8_X10, X9_X11, sp, 56
|
||||
// Load x0-x9 into the working registers (X8_X10/X9_X11 now hold x8-x9)
// and write them into the state copy at sp+16..sp+55.
ldm r3, {X0-X9_X11}
|
||||
__strd X0, X1, sp, 16
|
||||
__strd X2, X3, sp, 24
|
||||
__strd X4, X5, sp, 32
|
||||
__strd X6, X7, sp, 40
|
||||
__strd X8_X10, X9_X11, sp, 48
|
||||
|
||||
// Dispatch on the earlier 'cmp ip, #12': equal -> 12-round variant at
// label 1, otherwise fall through to the 20-round variant.
beq 1f
|
||||
_chacha 20
|
||||
|
||||
// Common exit.  Presumably the stack here is (x0-x15, OUT, IN, LEN) as
// noted inside _chacha, i.e. 64 + 12 = 76 bytes to discard before the
// saved r4-r11 and return address are popped.
0: add sp, #76
|
||||
pop {r4-r11, pc}
|
||||
|
||||
1: _chacha 12
|
||||
b 0b
|
||||
ENDPROC(chacha_doarm)
|
||||
|
||||
/*
|
||||
* void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
|
||||
*/
|
||||
ENTRY(hchacha_block_arm)
// Arguments (see the prototype comment above): r0 = state,
// r1 = out, r2 = nrounds.
|
||||
// Save 'out' (r1) below the callee-saved registers; it is popped back
// into r4 just before the final store.
push {r1,r4-r11,lr}
|
||||
|
||||
// The flags set here are consumed by the 'beq 1f' below; the
// instructions in between are all non-flag-setting forms.
cmp r2, #12 // ChaCha12 ?
|
||||
|
||||
// r0 is about to be overwritten with x0, so walk the state via r14.
mov r14, r0
|
||||
ldmia r14!, {r0-r11} // load x0-x11
|
||||
push {r10-r11} // store x10-x11 to stack
|
||||
ldm r14, {r10-r12,r14} // load x12-x15
|
||||
// Reserve two more words below x10-x11 (the "unused0-unused1" slots
// named in the comment at label 0).
sub sp, #8
|
||||
|
||||
beq 1f
|
||||
_chacha_permute 20
|
||||
|
||||
// Skip over (unused0-unused1, x10-x11)
|
||||
0: add sp, #16
|
||||
|
||||
// Fix up rotations of x12-x15
|
||||
ror X12, X12, #drot
|
||||
ror X13, X13, #drot
|
||||
pop {r4} // load 'out'
|
||||
ror X14, X14, #drot
|
||||
ror X15, X15, #drot
|
||||
|
||||
// Store (x0-x3,x12-x15) to 'out'
|
||||
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
|
||||
|
||||
// Restore callee-saved registers and return (saved lr is popped to pc).
pop {r4-r11,pc}
|
||||
|
||||
// 12-round variant; rejoins the common tail at label 0.
1: _chacha_permute 12
|
||||
b 0b
|
||||
ENDPROC(hchacha_block_arm)
|
||||
2062
arch/arm/crypto/curve25519-core.S
Normal file
2062
arch/arm/crypto/curve25519-core.S
Normal file
File diff suppressed because it is too large
Load Diff
135
arch/arm/crypto/curve25519-glue.c
Normal file
135
arch/arm/crypto/curve25519-glue.c
Normal file
@@ -0,0 +1,135 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||
/*
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
*
|
||||
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
|
||||
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
|
||||
* manually reworked for use in kernel space.
|
||||
*/
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
#include <crypto/internal/kpp.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <crypto/curve25519.h>
|
||||
|
||||
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
|
||||
const u8 secret[CURVE25519_KEY_SIZE],
|
||||
const u8 basepoint[CURVE25519_KEY_SIZE]);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
|
||||
const u8 scalar[CURVE25519_KEY_SIZE],
|
||||
const u8 point[CURVE25519_KEY_SIZE])
|
||||
{
|
||||
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||
kernel_neon_begin();
|
||||
curve25519_neon(out, scalar, point);
|
||||
kernel_neon_end();
|
||||
} else {
|
||||
curve25519_generic(out, scalar, point);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(curve25519_arch);
|
||||
|
||||
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
|
||||
const u8 secret[CURVE25519_KEY_SIZE])
|
||||
{
|
||||
return curve25519_arch(pub, secret, curve25519_base_point);
|
||||
}
|
||||
EXPORT_SYMBOL(curve25519_base_arch);
|
||||
|
||||
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
|
||||
unsigned int len)
|
||||
{
|
||||
u8 *secret = kpp_tfm_ctx(tfm);
|
||||
|
||||
if (!len)
|
||||
curve25519_generate_secret(secret);
|
||||
else if (len == CURVE25519_KEY_SIZE &&
|
||||
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
|
||||
memcpy(secret, buf, CURVE25519_KEY_SIZE);
|
||||
else
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int curve25519_compute_value(struct kpp_request *req)
|
||||
{
|
||||
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
|
||||
const u8 *secret = kpp_tfm_ctx(tfm);
|
||||
u8 public_key[CURVE25519_KEY_SIZE];
|
||||
u8 buf[CURVE25519_KEY_SIZE];
|
||||
int copied, nbytes;
|
||||
u8 const *bp;
|
||||
|
||||
if (req->src) {
|
||||
copied = sg_copy_to_buffer(req->src,
|
||||
sg_nents_for_len(req->src,
|
||||
CURVE25519_KEY_SIZE),
|
||||
public_key, CURVE25519_KEY_SIZE);
|
||||
if (copied != CURVE25519_KEY_SIZE)
|
||||
return -EINVAL;
|
||||
bp = public_key;
|
||||
} else {
|
||||
bp = curve25519_base_point;
|
||||
}
|
||||
|
||||
curve25519_arch(buf, secret, bp);
|
||||
|
||||
/* might want less than we've got */
|
||||
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
|
||||
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
|
||||
nbytes),
|
||||
buf, nbytes);
|
||||
if (copied != nbytes)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
|
||||
{
|
||||
return CURVE25519_KEY_SIZE;
|
||||
}
|
||||
|
||||
static struct kpp_alg curve25519_alg = {
|
||||
.base.cra_name = "curve25519",
|
||||
.base.cra_driver_name = "curve25519-neon",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
|
||||
|
||||
.set_secret = curve25519_set_secret,
|
||||
.generate_public_key = curve25519_compute_value,
|
||||
.compute_shared_secret = curve25519_compute_value,
|
||||
.max_size = curve25519_max_size,
|
||||
};
|
||||
|
||||
/*
 * Module init: when the CPU advertises NEON, enable the have_neon static
 * key and, if the KPP API is reachable from this object, register the
 * algorithm.  Without NEON nothing is registered and 0 is returned.
 */
static int __init mod_init(void)
{
	if (!(elf_hwcap & HWCAP_NEON))
		return 0;

	static_branch_enable(&have_neon);

	if (IS_REACHABLE(CONFIG_CRYPTO_KPP))
		return crypto_register_kpp(&curve25519_alg);

	return 0;
}
|
||||
|
||||
/*
 * Module exit: undo the registration done by mod_init(), which only
 * happens when the KPP API is reachable and the CPU has NEON.
 */
static void __exit mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_KPP))
		return;

	if (!(elf_hwcap & HWCAP_NEON))
		return;

	crypto_unregister_kpp(&curve25519_alg);
}
|
||||
|
||||
module_init(mod_init);
|
||||
module_exit(mod_exit);
|
||||
|
||||
MODULE_ALIAS_CRYPTO("curve25519");
|
||||
MODULE_ALIAS_CRYPTO("curve25519-neon");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
File diff suppressed because it is too large
Load Diff
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
File diff suppressed because it is too large
Load Diff
272
arch/arm/crypto/poly1305-glue.c
Normal file
272
arch/arm/crypto/poly1305-glue.c
Normal file
@@ -0,0 +1,272 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
|
||||
*
|
||||
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
void poly1305_init_arm(void *state, const u8 *key);
|
||||
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
/*
 * Weak, empty fallback for poly1305_blocks_neon().  Being __weak, it is
 * overridden by any strong definition linked into the module (presumably
 * the NEON assembly when it is built - confirm against the Makefile).
 */
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len,
				 u32 hibit)
{
}
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_init_arm(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static int arm_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * arm_poly1305_blocks - consume key material (if still missing) and then
 * process whole blocks.
 * @dctx:    descriptor context
 * @src:     input data
 * @len:     number of input bytes (callers in this file pass at least one
 *           full block)
 * @hibit:   forwarded unchanged to the block function
 * @do_neon: caller permits the NEON implementation
 *
 * Until dctx->sset is true, the leading blocks of @src are taken as key
 * material: the first one is passed to poly1305_init_arm() (tracked by
 * rset), the next one is stored as four little-endian words in dctx->s
 * (tracked by sset).  Whatever remains is rounded down to a multiple of
 * POLY1305_BLOCK_SIZE and handed to the NEON or scalar block routine.
 */
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit, bool do_neon)
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			poly1305_init_arm(&dctx->h, src);
			dctx->rset = 1;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		if (len >= POLY1305_BLOCK_SIZE) {
			int i;

			for (i = 0; i < 4; i++)
				dctx->s[i] = get_unaligned_le32(src + 4 * i);
			dctx->sset = true;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		/* Nothing but key material (so far): no data to hash yet. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	/* Only whole blocks are processed here. */
	len = round_down(len, POLY1305_BLOCK_SIZE);

	if (static_branch_likely(&have_neon) && likely(do_neon)) {
		poly1305_blocks_neon(&dctx->h, src, len, hibit);
		return;
	}

	poly1305_blocks_arm(&dctx->h, src, len, hibit);
}
|
||||
|
||||
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, u32 len, bool do_neon)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
len -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
arm_poly1305_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1, false);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||
len %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(len)) {
|
||||
dctx->buflen = len;
|
||||
memcpy(dctx->buf, src, len);
|
||||
}
|
||||
}
|
||||
|
||||
static int arm_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
arm_poly1305_do_update(dctx, src, srclen, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * shash .update callback for the NEON driver.  NEON is only requested when
 * may_use_simd() allows it and the input is longer than 128 bytes; the
 * kernel_neon_begin()/kernel_neon_end() bracket is additionally gated on
 * the have_neon static key.  Always returns 0.
 */
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
						   const u8 *src,
						   unsigned int srclen)
{
	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);
	bool do_neon = may_use_simd() && srclen > 128;
	bool in_neon_section = static_branch_likely(&have_neon) && do_neon;

	if (in_neon_section)
		kernel_neon_begin();

	arm_poly1305_do_update(ctx, src, srclen, do_neon);

	if (in_neon_section)
		kernel_neon_end();

	return 0;
}
|
||||
|
||||
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int nbytes)
|
||||
{
|
||||
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||
may_use_simd();
|
||||
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
nbytes -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_blocks_arm(&dctx->h, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||
|
||||
if (static_branch_likely(&have_neon) && do_neon) {
|
||||
do {
|
||||
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||
|
||||
kernel_neon_begin();
|
||||
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||
kernel_neon_end();
|
||||
|
||||
len -= todo;
|
||||
src += todo;
|
||||
} while (len);
|
||||
} else {
|
||||
poly1305_blocks_arm(&dctx->h, src, len, 1);
|
||||
src += len;
|
||||
}
|
||||
nbytes %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(nbytes)) {
|
||||
dctx->buflen = nbytes;
|
||||
memcpy(dctx->buf, src, nbytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
/*
 * poly1305_final_arch - finish the computation and emit the digest.
 * @dctx: descriptor context
 * @dst:  receives the digest written by poly1305_emit_arm()
 *
 * A buffered partial block is terminated with a single 1 byte, zero-padded
 * to POLY1305_BLOCK_SIZE and processed with hibit 0.  After the digest has
 * been emitted, the whole context is overwritten with zeroes.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		dctx->buf[dctx->buflen] = 1;
		dctx->buflen++;
		memset(&dctx->buf[dctx->buflen], 0,
		       POLY1305_BLOCK_SIZE - dctx->buflen);
		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_arm(&dctx->h, dst, dctx->s);

	/* Do not leave key or state material behind in the context. */
	*dctx = (struct poly1305_desc_ctx){};
}
|
||||
EXPORT_SYMBOL(poly1305_final_arch);
|
||||
|
||||
/*
 * shash .final callback.
 *
 * Return: -ENOKEY if the key has not been fully supplied (sset is false),
 * otherwise 0 after poly1305_final_arch() has written the digest to @dst.
 */
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);

	if (likely(ctx->sset)) {
		poly1305_final_arch(ctx, dst);
		return 0;
	}

	return -ENOKEY;
}
|
||||
|
||||
static struct shash_alg arm_poly1305_algs[] = {{
|
||||
.init = arm_poly1305_init,
|
||||
.update = arm_poly1305_update,
|
||||
.final = arm_poly1305_final,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
|
||||
.base.cra_name = "poly1305",
|
||||
.base.cra_driver_name = "poly1305-arm",
|
||||
.base.cra_priority = 150,
|
||||
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||
}, {
|
||||
.init = arm_poly1305_init,
|
||||
.update = arm_poly1305_update_neon,
|
||||
.final = arm_poly1305_final,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
|
||||
.base.cra_name = "poly1305",
|
||||
.base.cra_driver_name = "poly1305-neon",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
#endif
|
||||
}};
|
||||
|
||||
/*
 * Module init.  With NEON support built in and present on the CPU, enable
 * the have_neon static key and register every table entry; otherwise
 * register only the scalar driver (entry 0).  Nothing is registered when
 * the hash API is not reachable from this object.
 *
 * Return: 0 or the error from the crypto registration call.
 */
static int __init arm_poly1305_mod_init(void)
{
	bool neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		    (elf_hwcap & HWCAP_NEON);

	if (neon)
		static_branch_enable(&have_neon);

	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	if (!neon)
		return crypto_register_shash(&arm_poly1305_algs[0]);

	return crypto_register_shashes(arm_poly1305_algs,
				       ARRAY_SIZE(arm_poly1305_algs));
}
|
||||
|
||||
/*
 * Module exit: mirror arm_poly1305_mod_init().  The have_neon key tells
 * whether all entries or only the scalar driver (entry 0) were registered.
 */
static void __exit arm_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	if (static_branch_likely(&have_neon))
		crypto_unregister_shashes(arm_poly1305_algs,
					  ARRAY_SIZE(arm_poly1305_algs));
	else
		crypto_unregister_shash(&arm_poly1305_algs[0]);
}
|
||||
|
||||
module_init(arm_poly1305_mod_init);
|
||||
module_exit(arm_poly1305_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("poly1305");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-arm");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
||||
@@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,
|
||||
|
||||
ret = of_property_read_u32(np, "prefetch-data", &val);
|
||||
if (ret == 0) {
|
||||
if (val)
|
||||
if (val) {
|
||||
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||
else
|
||||
*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||
} else {
|
||||
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||
*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||
}
|
||||
*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||
} else if (ret != -EINVAL) {
|
||||
pr_err("L2C-310 OF prefetch-data property value is missing\n");
|
||||
}
|
||||
|
||||
ret = of_property_read_u32(np, "prefetch-instr", &val);
|
||||
if (ret == 0) {
|
||||
if (val)
|
||||
if (val) {
|
||||
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||
else
|
||||
*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||
} else {
|
||||
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||
*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||
}
|
||||
*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||
} else if (ret != -EINVAL) {
|
||||
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
|
||||
}
|
||||
|
||||
@@ -10,12 +10,13 @@ obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
|
||||
ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector
|
||||
ccflags-y += -DDISABLE_BRANCH_PROFILING
|
||||
|
||||
VDSO_LDFLAGS := -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
|
||||
VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
|
||||
VDSO_LDFLAGS += -nostdlib -shared
|
||||
VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
|
||||
VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id)
|
||||
VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd)
|
||||
ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8
|
||||
ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \
|
||||
-z max-page-size=4096 -z common-page-size=4096 \
|
||||
-nostdlib -shared $(ldflags-y) \
|
||||
$(call ld-option, --hash-style=sysv) \
|
||||
$(call ld-option, --build-id) \
|
||||
-T
|
||||
|
||||
obj-$(CONFIG_VDSO) += vdso.o
|
||||
extra-$(CONFIG_VDSO) += vdso.lds
|
||||
@@ -37,8 +38,8 @@ KCOV_INSTRUMENT := n
|
||||
$(obj)/vdso.o : $(obj)/vdso.so
|
||||
|
||||
# Link rule for the .so file
|
||||
$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE
|
||||
$(call if_changed,vdsold)
|
||||
$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE
|
||||
$(call if_changed,ld)
|
||||
|
||||
$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/vdsomunge FORCE
|
||||
$(call if_changed,vdsomunge)
|
||||
@@ -48,11 +49,6 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
|
||||
$(obj)/%.so: $(obj)/%.so.dbg FORCE
|
||||
$(call if_changed,objcopy)
|
||||
|
||||
# Actual build commands
|
||||
quiet_cmd_vdsold = VDSO $@
|
||||
cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \
|
||||
-Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
|
||||
|
||||
quiet_cmd_vdsomunge = MUNGE $@
|
||||
cmd_vdsomunge = $(objtree)/$(obj)/vdsomunge $< $@
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#
|
||||
# Copyright (C) 1995-2001 by Russell King
|
||||
|
||||
LDFLAGS_vmlinux :=--no-undefined -X
|
||||
LDFLAGS_vmlinux :=--no-undefined -X -z norelro
|
||||
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
|
||||
GZFLAGS :=-9
|
||||
|
||||
@@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
|
||||
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
|
||||
# for relative relocs, since this leads to better Image compression
|
||||
# with the relocation offsets always being zero.
|
||||
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
|
||||
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
|
||||
$(call ld-option, --no-apply-dynamic-relocs)
|
||||
endif
|
||||
|
||||
|
||||
@@ -151,6 +151,7 @@
|
||||
};
|
||||
|
||||
&qspi {
|
||||
status = "okay";
|
||||
flash@0 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
|
||||
@@ -877,7 +877,7 @@
|
||||
reg-names = "mdp_phys";
|
||||
|
||||
interrupt-parent = <&mdss>;
|
||||
interrupts = <0 0>;
|
||||
interrupts = <0>;
|
||||
|
||||
clocks = <&gcc GCC_MDSS_AHB_CLK>,
|
||||
<&gcc GCC_MDSS_AXI_CLK>,
|
||||
@@ -909,7 +909,7 @@
|
||||
reg-names = "dsi_ctrl";
|
||||
|
||||
interrupt-parent = <&mdss>;
|
||||
interrupts = <4 0>;
|
||||
interrupts = <4>;
|
||||
|
||||
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
|
||||
<&gcc PCLK0_CLK_SRC>;
|
||||
|
||||
@@ -99,7 +99,7 @@
|
||||
|
||||
wcd_codec: codec@f000 {
|
||||
compatible = "qcom,pm8916-wcd-analog-codec";
|
||||
reg = <0xf000 0x200>;
|
||||
reg = <0xf000>;
|
||||
reg-names = "pmic-codec-core";
|
||||
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
|
||||
clock-names = "mclk";
|
||||
|
||||
@@ -411,7 +411,7 @@
|
||||
};
|
||||
|
||||
i2c0: i2c@ff020000 {
|
||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
||||
compatible = "cdns,i2c-r1p14";
|
||||
status = "disabled";
|
||||
interrupt-parent = <&gic>;
|
||||
interrupts = <0 17 4>;
|
||||
@@ -421,7 +421,7 @@
|
||||
};
|
||||
|
||||
i2c1: i2c@ff030000 {
|
||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
||||
compatible = "cdns,i2c-r1p14";
|
||||
status = "disabled";
|
||||
interrupt-parent = <&gic>;
|
||||
interrupts = <0 18 4>;
|
||||
|
||||
@@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
|
||||
CONFIG_ARM_SCPI_PROTOCOL=y
|
||||
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
|
||||
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
|
||||
CONFIG_ARM64_CRYPTO=y
|
||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
||||
CONFIG_JUMP_LABEL=y
|
||||
@@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||
CONFIG_DM_BOW=y
|
||||
CONFIG_NETDEVICES=y
|
||||
CONFIG_DUMMY=y
|
||||
CONFIG_WIREGUARD=y
|
||||
CONFIG_TUN=y
|
||||
CONFIG_VETH=y
|
||||
# CONFIG_ETHERNET is not set
|
||||
@@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
|
||||
CONFIG_HID_SONY=y
|
||||
CONFIG_HID_STEAM=y
|
||||
CONFIG_USB_HIDDEV=y
|
||||
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||
CONFIG_USB_OTG=y
|
||||
CONFIG_USB_XHCI_HCD=y
|
||||
CONFIG_USB_GADGET=y
|
||||
@@ -502,6 +503,7 @@ CONFIG_CRC8=y
|
||||
CONFIG_XZ_DEC=y
|
||||
CONFIG_PRINTK_TIME=y
|
||||
CONFIG_DEBUG_INFO=y
|
||||
CONFIG_DEBUG_INFO_DWARF4=y
|
||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||
CONFIG_MAGIC_SYSRQ=y
|
||||
|
||||
1
arch/arm64/crypto/.gitignore
vendored
1
arch/arm64/crypto/.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
sha256-core.S
|
||||
sha512-core.S
|
||||
poly1305-core.S
|
||||
|
||||
@@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
|
||||
select CRYPTO_SIMD
|
||||
|
||||
config CRYPTO_CHACHA20_NEON
|
||||
tristate "NEON accelerated ChaCha20 symmetric cipher"
|
||||
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
|
||||
depends on KERNEL_MODE_NEON
|
||||
select CRYPTO_BLKCIPHER
|
||||
select CRYPTO_CHACHA20
|
||||
select CRYPTO_LIB_CHACHA_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||
|
||||
config CRYPTO_POLY1305_NEON
|
||||
tristate "Poly1305 hash function using scalar or NEON instructions"
|
||||
depends on KERNEL_MODE_NEON
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||
|
||||
config CRYPTO_AES_ARM64_BS
|
||||
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
|
||||
|
||||
@@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
|
||||
sha512-arm64-y := sha512-glue.o sha512-core.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
|
||||
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
|
||||
poly1305-neon-y := poly1305-core.o poly1305-glue.o
|
||||
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
|
||||
|
||||
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
|
||||
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
|
||||
@@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
|
||||
quiet_cmd_perlasm = PERLASM $@
|
||||
cmd_perlasm = $(PERL) $(<) void $(@)
|
||||
|
||||
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
|
||||
$(call cmd,perlasm)
|
||||
|
||||
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
|
||||
$(call cmd,perlasm)
|
||||
|
||||
@@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
|
||||
$(call cmd,perlasm)
|
||||
endif
|
||||
|
||||
targets += sha256-core.S sha512-core.S
|
||||
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
||||
* ChaCha/XChaCha NEON helper functions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Based on:
|
||||
* Originally based on:
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
@@ -19,29 +19,27 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
#include <asm/cache.h>
|
||||
|
||||
.text
|
||||
.align 6
|
||||
|
||||
ENTRY(chacha20_block_xor_neon)
|
||||
// x0: Input state matrix, s
|
||||
// x1: 1 data block output, o
|
||||
// x2: 1 data block input, i
|
||||
/*
|
||||
* chacha_permute - permute one block
|
||||
*
|
||||
* Permute one 64-byte block where the state matrix is stored in the four NEON
|
||||
* registers v0-v3. It performs matrix operations on four words in parallel,
|
||||
* but requires shuffling to rearrange the words after each round.
|
||||
*
|
||||
* The round count is given in w3.
|
||||
*
|
||||
* Clobbers: w3, x10, v4, v12
|
||||
*/
|
||||
chacha_permute:
|
||||
|
||||
//
|
||||
// This function encrypts one ChaCha20 block by loading the state matrix
|
||||
// in four NEON registers. It performs matrix operation on four words in
|
||||
// parallel, but requires shuffling to rearrange the words after each
|
||||
// round.
|
||||
//
|
||||
|
||||
// x0..3 = s0..3
|
||||
adr x3, ROT8
|
||||
ld1 {v0.4s-v3.4s}, [x0]
|
||||
ld1 {v8.4s-v11.4s}, [x0]
|
||||
ld1 {v12.4s}, [x3]
|
||||
|
||||
mov x3, #10
|
||||
adr_l x10, ROT8
|
||||
ld1 {v12.4s}, [x10]
|
||||
|
||||
.Ldoubleround:
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
@@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
|
||||
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
ext v3.16b, v3.16b, v3.16b, #4
|
||||
|
||||
subs x3, x3, #1
|
||||
subs w3, w3, #2
|
||||
b.ne .Ldoubleround
|
||||
|
||||
ret
|
||||
ENDPROC(chacha_permute)
|
||||
|
||||
ENTRY(chacha_block_xor_neon)
|
||||
// x0: Input state matrix, s
|
||||
// x1: 1 data block output, o
|
||||
// x2: 1 data block input, i
|
||||
// w3: nrounds
|
||||
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
// x0..3 = s0..3
|
||||
ld1 {v0.4s-v3.4s}, [x0]
|
||||
ld1 {v8.4s-v11.4s}, [x0]
|
||||
|
||||
bl chacha_permute
|
||||
|
||||
ld1 {v4.16b-v7.16b}, [x2]
|
||||
|
||||
// o0 = i0 ^ (x0 + s0)
|
||||
@@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
|
||||
|
||||
st1 {v0.16b-v3.16b}, [x1]
|
||||
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
ENDPROC(chacha20_block_xor_neon)
|
||||
ENDPROC(chacha_block_xor_neon)
|
||||
|
||||
ENTRY(hchacha_block_neon)
|
||||
// x0: Input state matrix, s
|
||||
// x1: output (8 32-bit words)
|
||||
// w2: nrounds
|
||||
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
ld1 {v0.4s-v3.4s}, [x0]
|
||||
|
||||
mov w3, w2
|
||||
bl chacha_permute
|
||||
|
||||
st1 {v0.4s}, [x1], #16
|
||||
st1 {v3.4s}, [x1]
|
||||
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
ENDPROC(hchacha_block_neon)
|
||||
|
||||
a0 .req w12
|
||||
a1 .req w13
|
||||
a2 .req w14
|
||||
a3 .req w15
|
||||
a4 .req w16
|
||||
a5 .req w17
|
||||
a6 .req w19
|
||||
a7 .req w20
|
||||
a8 .req w21
|
||||
a9 .req w22
|
||||
a10 .req w23
|
||||
a11 .req w24
|
||||
a12 .req w25
|
||||
a13 .req w26
|
||||
a14 .req w27
|
||||
a15 .req w28
|
||||
|
||||
.align 6
|
||||
ENTRY(chacha20_4block_xor_neon)
|
||||
ENTRY(chacha_4block_xor_neon)
|
||||
frame_push 10
|
||||
|
||||
// x0: Input state matrix, s
|
||||
// x1: 4 data blocks output, o
|
||||
// x2: 4 data blocks input, i
|
||||
// w3: nrounds
|
||||
// x4: byte count
|
||||
|
||||
adr_l x10, .Lpermute
|
||||
and x5, x4, #63
|
||||
add x10, x10, x5
|
||||
add x11, x10, #64
|
||||
|
||||
//
|
||||
// This function encrypts four consecutive ChaCha20 blocks by loading
|
||||
// This function encrypts four consecutive ChaCha blocks by loading
|
||||
// the state matrix in NEON registers four times. The algorithm performs
|
||||
// each operation on the corresponding word of each state matrix, hence
|
||||
// requires no word shuffling. For final XORing step we transpose the
|
||||
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
||||
// do XOR in NEON registers.
|
||||
//
|
||||
adr x3, CTRINC // ... and ROT8
|
||||
ld1 {v30.4s-v31.4s}, [x3]
|
||||
// At the same time, a fifth block is encrypted in parallel using
|
||||
// scalar registers
|
||||
//
|
||||
adr_l x9, CTRINC // ... and ROT8
|
||||
ld1 {v30.4s-v31.4s}, [x9]
|
||||
|
||||
// x0..15[0-3] = s0..3[0..3]
|
||||
mov x4, x0
|
||||
ld4r { v0.4s- v3.4s}, [x4], #16
|
||||
ld4r { v4.4s- v7.4s}, [x4], #16
|
||||
ld4r { v8.4s-v11.4s}, [x4], #16
|
||||
ld4r {v12.4s-v15.4s}, [x4]
|
||||
add x8, x0, #16
|
||||
ld4r { v0.4s- v3.4s}, [x0]
|
||||
ld4r { v4.4s- v7.4s}, [x8], #16
|
||||
ld4r { v8.4s-v11.4s}, [x8], #16
|
||||
ld4r {v12.4s-v15.4s}, [x8]
|
||||
|
||||
// x12 += counter values 0-3
|
||||
mov a0, v0.s[0]
|
||||
mov a1, v1.s[0]
|
||||
mov a2, v2.s[0]
|
||||
mov a3, v3.s[0]
|
||||
mov a4, v4.s[0]
|
||||
mov a5, v5.s[0]
|
||||
mov a6, v6.s[0]
|
||||
mov a7, v7.s[0]
|
||||
mov a8, v8.s[0]
|
||||
mov a9, v9.s[0]
|
||||
mov a10, v10.s[0]
|
||||
mov a11, v11.s[0]
|
||||
mov a12, v12.s[0]
|
||||
mov a13, v13.s[0]
|
||||
mov a14, v14.s[0]
|
||||
mov a15, v15.s[0]
|
||||
|
||||
// x12 += counter values 1-4
|
||||
add v12.4s, v12.4s, v30.4s
|
||||
|
||||
mov x3, #10
|
||||
|
||||
.Ldoubleround4:
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add a0, a0, a4
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add a1, a1, a5
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add a2, a2, a6
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
add a3, a3, a7
|
||||
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor a12, a12, a0
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor a13, a13, a1
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor a14, a14, a2
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
eor a15, a15, a3
|
||||
|
||||
rev32 v12.8h, v12.8h
|
||||
ror a12, a12, #16
|
||||
rev32 v13.8h, v13.8h
|
||||
ror a13, a13, #16
|
||||
rev32 v14.8h, v14.8h
|
||||
ror a14, a14, #16
|
||||
rev32 v15.8h, v15.8h
|
||||
ror a15, a15, #16
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add a8, a8, a12
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add a9, a9, a13
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add a10, a10, a14
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
add a11, a11, a15
|
||||
|
||||
eor v16.16b, v4.16b, v8.16b
|
||||
eor a4, a4, a8
|
||||
eor v17.16b, v5.16b, v9.16b
|
||||
eor a5, a5, a9
|
||||
eor v18.16b, v6.16b, v10.16b
|
||||
eor a6, a6, a10
|
||||
eor v19.16b, v7.16b, v11.16b
|
||||
eor a7, a7, a11
|
||||
|
||||
shl v4.4s, v16.4s, #12
|
||||
shl v5.4s, v17.4s, #12
|
||||
@@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
shl v7.4s, v19.4s, #12
|
||||
|
||||
sri v4.4s, v16.4s, #20
|
||||
ror a4, a4, #20
|
||||
sri v5.4s, v17.4s, #20
|
||||
ror a5, a5, #20
|
||||
sri v6.4s, v18.4s, #20
|
||||
ror a6, a6, #20
|
||||
sri v7.4s, v19.4s, #20
|
||||
ror a7, a7, #20
|
||||
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add a0, a0, a4
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add a1, a1, a5
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add a2, a2, a6
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
add a3, a3, a7
|
||||
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor a12, a12, a0
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor a13, a13, a1
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor a14, a14, a2
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
eor a15, a15, a3
|
||||
|
||||
tbl v12.16b, {v12.16b}, v31.16b
|
||||
ror a12, a12, #24
|
||||
tbl v13.16b, {v13.16b}, v31.16b
|
||||
ror a13, a13, #24
|
||||
tbl v14.16b, {v14.16b}, v31.16b
|
||||
ror a14, a14, #24
|
||||
tbl v15.16b, {v15.16b}, v31.16b
|
||||
ror a15, a15, #24
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add a8, a8, a12
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add a9, a9, a13
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add a10, a10, a14
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
add a11, a11, a15
|
||||
|
||||
eor v16.16b, v4.16b, v8.16b
|
||||
eor a4, a4, a8
|
||||
eor v17.16b, v5.16b, v9.16b
|
||||
eor a5, a5, a9
|
||||
eor v18.16b, v6.16b, v10.16b
|
||||
eor a6, a6, a10
|
||||
eor v19.16b, v7.16b, v11.16b
|
||||
eor a7, a7, a11
|
||||
|
||||
shl v4.4s, v16.4s, #7
|
||||
shl v5.4s, v17.4s, #7
|
||||
@@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
shl v7.4s, v19.4s, #7
|
||||
|
||||
sri v4.4s, v16.4s, #25
|
||||
ror a4, a4, #25
|
||||
sri v5.4s, v17.4s, #25
|
||||
ror a5, a5, #25
|
||||
sri v6.4s, v18.4s, #25
|
||||
ror a6, a6, #25
|
||||
sri v7.4s, v19.4s, #25
|
||||
ror a7, a7, #25
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add a0, a0, a5
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add a1, a1, a6
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add a2, a2, a7
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
add a3, a3, a4
|
||||
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor a15, a15, a0
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor a12, a12, a1
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor a13, a13, a2
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
eor a14, a14, a3
|
||||
|
||||
rev32 v15.8h, v15.8h
|
||||
ror a15, a15, #16
|
||||
rev32 v12.8h, v12.8h
|
||||
ror a12, a12, #16
|
||||
rev32 v13.8h, v13.8h
|
||||
ror a13, a13, #16
|
||||
rev32 v14.8h, v14.8h
|
||||
ror a14, a14, #16
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add a10, a10, a15
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add a11, a11, a12
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add a8, a8, a13
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
add a9, a9, a14
|
||||
|
||||
eor v16.16b, v5.16b, v10.16b
|
||||
eor a5, a5, a10
|
||||
eor v17.16b, v6.16b, v11.16b
|
||||
eor a6, a6, a11
|
||||
eor v18.16b, v7.16b, v8.16b
|
||||
eor a7, a7, a8
|
||||
eor v19.16b, v4.16b, v9.16b
|
||||
eor a4, a4, a9
|
||||
|
||||
shl v5.4s, v16.4s, #12
|
||||
shl v6.4s, v17.4s, #12
|
||||
@@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
shl v4.4s, v19.4s, #12
|
||||
|
||||
sri v5.4s, v16.4s, #20
|
||||
ror a5, a5, #20
|
||||
sri v6.4s, v17.4s, #20
|
||||
ror a6, a6, #20
|
||||
sri v7.4s, v18.4s, #20
|
||||
ror a7, a7, #20
|
||||
sri v4.4s, v19.4s, #20
|
||||
ror a4, a4, #20
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add a0, a0, a5
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add a1, a1, a6
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add a2, a2, a7
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
add a3, a3, a4
|
||||
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor a15, a15, a0
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor a12, a12, a1
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor a13, a13, a2
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
eor a14, a14, a3
|
||||
|
||||
tbl v15.16b, {v15.16b}, v31.16b
|
||||
ror a15, a15, #24
|
||||
tbl v12.16b, {v12.16b}, v31.16b
|
||||
ror a12, a12, #24
|
||||
tbl v13.16b, {v13.16b}, v31.16b
|
||||
ror a13, a13, #24
|
||||
tbl v14.16b, {v14.16b}, v31.16b
|
||||
ror a14, a14, #24
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add a10, a10, a15
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add a11, a11, a12
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add a8, a8, a13
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
add a9, a9, a14
|
||||
|
||||
eor v16.16b, v5.16b, v10.16b
|
||||
eor a5, a5, a10
|
||||
eor v17.16b, v6.16b, v11.16b
|
||||
eor a6, a6, a11
|
||||
eor v18.16b, v7.16b, v8.16b
|
||||
eor a7, a7, a8
|
||||
eor v19.16b, v4.16b, v9.16b
|
||||
eor a4, a4, a9
|
||||
|
||||
shl v5.4s, v16.4s, #7
|
||||
shl v6.4s, v17.4s, #7
|
||||
@@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
shl v4.4s, v19.4s, #7
|
||||
|
||||
sri v5.4s, v16.4s, #25
|
||||
ror a5, a5, #25
|
||||
sri v6.4s, v17.4s, #25
|
||||
ror a6, a6, #25
|
||||
sri v7.4s, v18.4s, #25
|
||||
ror a7, a7, #25
|
||||
sri v4.4s, v19.4s, #25
|
||||
ror a4, a4, #25
|
||||
|
||||
subs x3, x3, #1
|
||||
subs w3, w3, #2
|
||||
b.ne .Ldoubleround4
|
||||
|
||||
ld4r {v16.4s-v19.4s}, [x0], #16
|
||||
@@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
// x2[0-3] += s0[2]
|
||||
// x3[0-3] += s0[3]
|
||||
add v0.4s, v0.4s, v16.4s
|
||||
mov w6, v16.s[0]
|
||||
mov w7, v17.s[0]
|
||||
add v1.4s, v1.4s, v17.4s
|
||||
mov w8, v18.s[0]
|
||||
mov w9, v19.s[0]
|
||||
add v2.4s, v2.4s, v18.4s
|
||||
add a0, a0, w6
|
||||
add a1, a1, w7
|
||||
add v3.4s, v3.4s, v19.4s
|
||||
add a2, a2, w8
|
||||
add a3, a3, w9
|
||||
CPU_BE( rev a0, a0 )
|
||||
CPU_BE( rev a1, a1 )
|
||||
CPU_BE( rev a2, a2 )
|
||||
CPU_BE( rev a3, a3 )
|
||||
|
||||
ld4r {v24.4s-v27.4s}, [x0], #16
|
||||
ld4r {v28.4s-v31.4s}, [x0]
|
||||
@@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
|
||||
// x6[0-3] += s1[2]
|
||||
// x7[0-3] += s1[3]
|
||||
add v4.4s, v4.4s, v20.4s
|
||||
mov w6, v20.s[0]
|
||||
mov w7, v21.s[0]
|
||||
add v5.4s, v5.4s, v21.4s
|
||||
mov w8, v22.s[0]
|
||||
mov w9, v23.s[0]
|
||||
add v6.4s, v6.4s, v22.4s
|
||||
add a4, a4, w6
|
||||
add a5, a5, w7
|
||||
add v7.4s, v7.4s, v23.4s
|
||||
add a6, a6, w8
|
||||
add a7, a7, w9
|
||||
CPU_BE( rev a4, a4 )
|
||||
CPU_BE( rev a5, a5 )
|
||||
CPU_BE( rev a6, a6 )
|
||||
CPU_BE( rev a7, a7 )
|
||||
|
||||
// x8[0-3] += s2[0]
|
||||
// x9[0-3] += s2[1]
|
||||
// x10[0-3] += s2[2]
|
||||
// x11[0-3] += s2[3]
|
||||
add v8.4s, v8.4s, v24.4s
|
||||
mov w6, v24.s[0]
|
||||
mov w7, v25.s[0]
|
||||
add v9.4s, v9.4s, v25.4s
|
||||
mov w8, v26.s[0]
|
||||
mov w9, v27.s[0]
|
||||
add v10.4s, v10.4s, v26.4s
|
||||
add a8, a8, w6
|
||||
add a9, a9, w7
|
||||
add v11.4s, v11.4s, v27.4s
|
||||
add a10, a10, w8
|
||||
add a11, a11, w9
|
||||
CPU_BE( rev a8, a8 )
|
||||
CPU_BE( rev a9, a9 )
|
||||
CPU_BE( rev a10, a10 )
|
||||
CPU_BE( rev a11, a11 )
|
||||
|
||||
// x12[0-3] += s3[0]
|
||||
// x13[0-3] += s3[1]
|
||||
// x14[0-3] += s3[2]
|
||||
// x15[0-3] += s3[3]
|
||||
add v12.4s, v12.4s, v28.4s
|
||||
mov w6, v28.s[0]
|
||||
mov w7, v29.s[0]
|
||||
add v13.4s, v13.4s, v29.4s
|
||||
mov w8, v30.s[0]
|
||||
mov w9, v31.s[0]
|
||||
add v14.4s, v14.4s, v30.4s
|
||||
add a12, a12, w6
|
||||
add a13, a13, w7
|
||||
add v15.4s, v15.4s, v31.4s
|
||||
add a14, a14, w8
|
||||
add a15, a15, w9
|
||||
CPU_BE( rev a12, a12 )
|
||||
CPU_BE( rev a13, a13 )
|
||||
CPU_BE( rev a14, a14 )
|
||||
CPU_BE( rev a15, a15 )
|
||||
|
||||
// interleave 32-bit words in state n, n+1
|
||||
ldp w6, w7, [x2], #64
|
||||
zip1 v16.4s, v0.4s, v1.4s
|
||||
ldp w8, w9, [x2, #-56]
|
||||
eor a0, a0, w6
|
||||
zip2 v17.4s, v0.4s, v1.4s
|
||||
eor a1, a1, w7
|
||||
zip1 v18.4s, v2.4s, v3.4s
|
||||
eor a2, a2, w8
|
||||
zip2 v19.4s, v2.4s, v3.4s
|
||||
eor a3, a3, w9
|
||||
ldp w6, w7, [x2, #-48]
|
||||
zip1 v20.4s, v4.4s, v5.4s
|
||||
ldp w8, w9, [x2, #-40]
|
||||
eor a4, a4, w6
|
||||
zip2 v21.4s, v4.4s, v5.4s
|
||||
eor a5, a5, w7
|
||||
zip1 v22.4s, v6.4s, v7.4s
|
||||
eor a6, a6, w8
|
||||
zip2 v23.4s, v6.4s, v7.4s
|
||||
eor a7, a7, w9
|
||||
ldp w6, w7, [x2, #-32]
|
||||
zip1 v24.4s, v8.4s, v9.4s
|
||||
ldp w8, w9, [x2, #-24]
|
||||
eor a8, a8, w6
|
||||
zip2 v25.4s, v8.4s, v9.4s
|
||||
eor a9, a9, w7
|
||||
zip1 v26.4s, v10.4s, v11.4s
|
||||
eor a10, a10, w8
|
||||
zip2 v27.4s, v10.4s, v11.4s
|
||||
eor a11, a11, w9
|
||||
ldp w6, w7, [x2, #-16]
|
||||
zip1 v28.4s, v12.4s, v13.4s
|
||||
ldp w8, w9, [x2, #-8]
|
||||
eor a12, a12, w6
|
||||
zip2 v29.4s, v12.4s, v13.4s
|
||||
eor a13, a13, w7
|
||||
zip1 v30.4s, v14.4s, v15.4s
|
||||
eor a14, a14, w8
|
||||
zip2 v31.4s, v14.4s, v15.4s
|
||||
eor a15, a15, w9
|
||||
|
||||
mov x3, #64
|
||||
subs x5, x4, #128
|
||||
add x6, x5, x2
|
||||
csel x3, x3, xzr, ge
|
||||
csel x2, x2, x6, ge
|
||||
|
||||
// interleave 64-bit words in state n, n+2
|
||||
zip1 v0.2d, v16.2d, v18.2d
|
||||
zip2 v4.2d, v16.2d, v18.2d
|
||||
stp a0, a1, [x1], #64
|
||||
zip1 v8.2d, v17.2d, v19.2d
|
||||
zip2 v12.2d, v17.2d, v19.2d
|
||||
ld1 {v16.16b-v19.16b}, [x2], #64
|
||||
stp a2, a3, [x1, #-56]
|
||||
ld1 {v16.16b-v19.16b}, [x2], x3
|
||||
|
||||
subs x6, x4, #192
|
||||
ccmp x3, xzr, #4, lt
|
||||
add x7, x6, x2
|
||||
csel x3, x3, xzr, eq
|
||||
csel x2, x2, x7, eq
|
||||
|
||||
zip1 v1.2d, v20.2d, v22.2d
|
||||
zip2 v5.2d, v20.2d, v22.2d
|
||||
stp a4, a5, [x1, #-48]
|
||||
zip1 v9.2d, v21.2d, v23.2d
|
||||
zip2 v13.2d, v21.2d, v23.2d
|
||||
ld1 {v20.16b-v23.16b}, [x2], #64
|
||||
stp a6, a7, [x1, #-40]
|
||||
ld1 {v20.16b-v23.16b}, [x2], x3
|
||||
|
||||
subs x7, x4, #256
|
||||
ccmp x3, xzr, #4, lt
|
||||
add x8, x7, x2
|
||||
csel x3, x3, xzr, eq
|
||||
csel x2, x2, x8, eq
|
||||
|
||||
zip1 v2.2d, v24.2d, v26.2d
|
||||
zip2 v6.2d, v24.2d, v26.2d
|
||||
stp a8, a9, [x1, #-32]
|
||||
zip1 v10.2d, v25.2d, v27.2d
|
||||
zip2 v14.2d, v25.2d, v27.2d
|
||||
ld1 {v24.16b-v27.16b}, [x2], #64
|
||||
stp a10, a11, [x1, #-24]
|
||||
ld1 {v24.16b-v27.16b}, [x2], x3
|
||||
|
||||
subs x8, x4, #320
|
||||
ccmp x3, xzr, #4, lt
|
||||
add x9, x8, x2
|
||||
csel x2, x2, x9, eq
|
||||
|
||||
zip1 v3.2d, v28.2d, v30.2d
|
||||
zip2 v7.2d, v28.2d, v30.2d
|
||||
stp a12, a13, [x1, #-16]
|
||||
zip1 v11.2d, v29.2d, v31.2d
|
||||
zip2 v15.2d, v29.2d, v31.2d
|
||||
stp a14, a15, [x1, #-8]
|
||||
ld1 {v28.16b-v31.16b}, [x2]
|
||||
|
||||
// xor with corresponding input, write to output
|
||||
tbnz x5, #63, 0f
|
||||
eor v16.16b, v16.16b, v0.16b
|
||||
eor v17.16b, v17.16b, v1.16b
|
||||
eor v18.16b, v18.16b, v2.16b
|
||||
eor v19.16b, v19.16b, v3.16b
|
||||
st1 {v16.16b-v19.16b}, [x1], #64
|
||||
cbz x5, .Lout
|
||||
|
||||
tbnz x6, #63, 1f
|
||||
eor v20.16b, v20.16b, v4.16b
|
||||
eor v21.16b, v21.16b, v5.16b
|
||||
st1 {v16.16b-v19.16b}, [x1], #64
|
||||
eor v22.16b, v22.16b, v6.16b
|
||||
eor v23.16b, v23.16b, v7.16b
|
||||
st1 {v20.16b-v23.16b}, [x1], #64
|
||||
cbz x6, .Lout
|
||||
|
||||
tbnz x7, #63, 2f
|
||||
eor v24.16b, v24.16b, v8.16b
|
||||
eor v25.16b, v25.16b, v9.16b
|
||||
st1 {v20.16b-v23.16b}, [x1], #64
|
||||
eor v26.16b, v26.16b, v10.16b
|
||||
eor v27.16b, v27.16b, v11.16b
|
||||
eor v28.16b, v28.16b, v12.16b
|
||||
st1 {v24.16b-v27.16b}, [x1], #64
|
||||
cbz x7, .Lout
|
||||
|
||||
tbnz x8, #63, 3f
|
||||
eor v28.16b, v28.16b, v12.16b
|
||||
eor v29.16b, v29.16b, v13.16b
|
||||
eor v30.16b, v30.16b, v14.16b
|
||||
eor v31.16b, v31.16b, v15.16b
|
||||
st1 {v28.16b-v31.16b}, [x1]
|
||||
|
||||
.Lout: frame_pop
|
||||
ret
|
||||
ENDPROC(chacha20_4block_xor_neon)
|
||||
|
||||
CTRINC: .word 0, 1, 2, 3
|
||||
// fewer than 128 bytes of in/output
|
||||
0: ld1 {v8.16b}, [x10]
|
||||
ld1 {v9.16b}, [x11]
|
||||
movi v10.16b, #16
|
||||
sub x2, x1, #64
|
||||
add x1, x1, x5
|
||||
ld1 {v16.16b-v19.16b}, [x2]
|
||||
tbl v4.16b, {v0.16b-v3.16b}, v8.16b
|
||||
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v5.16b, {v0.16b-v3.16b}, v8.16b
|
||||
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v6.16b, {v0.16b-v3.16b}, v8.16b
|
||||
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v7.16b, {v0.16b-v3.16b}, v8.16b
|
||||
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||
|
||||
eor v20.16b, v20.16b, v4.16b
|
||||
eor v21.16b, v21.16b, v5.16b
|
||||
eor v22.16b, v22.16b, v6.16b
|
||||
eor v23.16b, v23.16b, v7.16b
|
||||
st1 {v20.16b-v23.16b}, [x1]
|
||||
b .Lout
|
||||
|
||||
// fewer than 192 bytes of in/output
|
||||
1: ld1 {v8.16b}, [x10]
|
||||
ld1 {v9.16b}, [x11]
|
||||
movi v10.16b, #16
|
||||
add x1, x1, x6
|
||||
tbl v0.16b, {v4.16b-v7.16b}, v8.16b
|
||||
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v1.16b, {v4.16b-v7.16b}, v8.16b
|
||||
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v2.16b, {v4.16b-v7.16b}, v8.16b
|
||||
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||
add v8.16b, v8.16b, v10.16b
|
||||
add v9.16b, v9.16b, v10.16b
|
||||
tbl v3.16b, {v4.16b-v7.16b}, v8.16b
|
||||
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||
|
||||
eor v20.16b, v20.16b, v0.16b
|
||||
eor v21.16b, v21.16b, v1.16b
|
||||
eor v22.16b, v22.16b, v2.16b
|
||||
eor v23.16b, v23.16b, v3.16b
|
||||
st1 {v20.16b-v23.16b}, [x1]
|
||||
b .Lout
|
||||
|
||||
// fewer than 256 bytes of in/output
|
||||
2: ld1 {v4.16b}, [x10]
|
||||
ld1 {v5.16b}, [x11]
|
||||
movi v6.16b, #16
|
||||
add x1, x1, x7
|
||||
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
|
||||
tbx v24.16b, {v20.16b-v23.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v1.16b, {v8.16b-v11.16b}, v4.16b
|
||||
tbx v25.16b, {v20.16b-v23.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v2.16b, {v8.16b-v11.16b}, v4.16b
|
||||
tbx v26.16b, {v20.16b-v23.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v3.16b, {v8.16b-v11.16b}, v4.16b
|
||||
tbx v27.16b, {v20.16b-v23.16b}, v5.16b
|
||||
|
||||
eor v24.16b, v24.16b, v0.16b
|
||||
eor v25.16b, v25.16b, v1.16b
|
||||
eor v26.16b, v26.16b, v2.16b
|
||||
eor v27.16b, v27.16b, v3.16b
|
||||
st1 {v24.16b-v27.16b}, [x1]
|
||||
b .Lout
|
||||
|
||||
// fewer than 320 bytes of in/output
|
||||
3: ld1 {v4.16b}, [x10]
|
||||
ld1 {v5.16b}, [x11]
|
||||
movi v6.16b, #16
|
||||
add x1, x1, x8
|
||||
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
|
||||
tbx v28.16b, {v24.16b-v27.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v1.16b, {v12.16b-v15.16b}, v4.16b
|
||||
tbx v29.16b, {v24.16b-v27.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v2.16b, {v12.16b-v15.16b}, v4.16b
|
||||
tbx v30.16b, {v24.16b-v27.16b}, v5.16b
|
||||
add v4.16b, v4.16b, v6.16b
|
||||
add v5.16b, v5.16b, v6.16b
|
||||
tbl v3.16b, {v12.16b-v15.16b}, v4.16b
|
||||
tbx v31.16b, {v24.16b-v27.16b}, v5.16b
|
||||
|
||||
eor v28.16b, v28.16b, v0.16b
|
||||
eor v29.16b, v29.16b, v1.16b
|
||||
eor v30.16b, v30.16b, v2.16b
|
||||
eor v31.16b, v31.16b, v3.16b
|
||||
st1 {v28.16b-v31.16b}, [x1]
|
||||
b .Lout
|
||||
ENDPROC(chacha_4block_xor_neon)
|
||||
|
||||
// Read-only lookup tables used by chacha_4block_xor_neon.
.section ".rodata", "a", %progbits
|
||||
.align L1_CACHE_SHIFT
|
||||
// .Lpermute: 192 bytes where entry i holds the byte value (i - 64), i.e. the
// values -64..127 truncated to a byte. chacha_4block_xor_neon points x10 at
// .Lpermute + (byte count & 63) and x11 at x10 + 64, and loads 16-byte
// tbl/tbx index vectors from those addresses when handling a partial tail.
.Lpermute:
|
||||
.set .Li, 0
|
||||
.rept 192
|
||||
.byte (.Li - 64)
|
||||
.set .Li, .Li + 1
|
||||
.endr
|
||||
|
||||
// CTRINC: per-lane values added to the block counter lanes in v12
// ("x12 += counter values 1-4" at the use site); the scalar copy of the
// counter is taken before this addition.
// ROT8: byte-index vector used with tbl (as v31) at the sites commented
// "rotl32(..., 8)".
// ROT8 must stay directly after CTRINC: both are fetched with a single
// "ld1 {v30.4s-v31.4s}" from the address of CTRINC.
CTRINC: .word 1, 2, 3, 4
|
||||
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
|
||||
@@ -1,8 +1,8 @@
|
||||
/*
|
||||
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
||||
* ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
|
||||
* including ChaCha20 (RFC7539)
|
||||
*
|
||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
@@ -20,8 +20,9 @@
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
@@ -29,40 +30,78 @@
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds);
|
||||
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds);
|
||||
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds, int bytes);
|
||||
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||
|
||||
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes, int nrounds)
|
||||
{
|
||||
u8 buf[CHACHA_BLOCK_SIZE];
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||
chacha_4block_xor_neon(state, dst, src, nrounds);
|
||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||
src += CHACHA_BLOCK_SIZE * 4;
|
||||
dst += CHACHA_BLOCK_SIZE * 4;
|
||||
state[12] += 4;
|
||||
}
|
||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
||||
chacha_block_xor_neon(state, dst, src, nrounds);
|
||||
bytes -= CHACHA_BLOCK_SIZE;
|
||||
src += CHACHA_BLOCK_SIZE;
|
||||
dst += CHACHA_BLOCK_SIZE;
|
||||
state[12]++;
|
||||
}
|
||||
if (bytes) {
|
||||
memcpy(buf, src, bytes);
|
||||
chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||
memcpy(dst, buf, bytes);
|
||||
/*
 * XOR @bytes bytes of @src with the ChaCha keystream into @dst using the
 * NEON routines, advancing the block counter in state[12] as it goes.
 *
 * While more than one block remains, the multi-block routine is called with
 * at most five blocks' worth of data per call. A final piece of one block or
 * less goes through a stack bounce buffer, because the single-block routine
 * always processes a full CHACHA_BLOCK_SIZE bytes.
 *
 * The caller is expected to have NEON enabled around this call (the callers
 * in this file wrap it in kernel_neon_begin()/kernel_neon_end()).
 */
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
			  int bytes, int nrounds)
{
	int chunk;

	for (; bytes > CHACHA_BLOCK_SIZE; bytes -= chunk) {
		chunk = min(bytes, CHACHA_BLOCK_SIZE * 5);

		chacha_4block_xor_neon(state, dst, src, nrounds, chunk);
		src += chunk;
		dst += chunk;
		/* A partially used trailing block still consumes a counter. */
		state[12] += DIV_ROUND_UP(chunk, CHACHA_BLOCK_SIZE);
	}

	if (bytes > 0) {
		u8 buf[CHACHA_BLOCK_SIZE];

		memcpy(buf, src, bytes);
		chacha_block_xor_neon(state, buf, buf, nrounds);
		memcpy(dst, buf, bytes);
		state[12]++;
	}
}
|
||||
|
||||
/*
 * Compute an HChaCha block from @state into @stream with @nrounds rounds.
 *
 * The NEON routine is used only when the have_neon static key is enabled
 * and SIMD may be used in the current context; otherwise the generic
 * implementation is called.
 */
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (static_branch_likely(&have_neon) && may_use_simd()) {
		kernel_neon_begin();
		hchacha_block_neon(state, stream, nrounds);
		kernel_neon_end();
		return;
	}

	hchacha_block_generic(state, stream, nrounds);
}
|
||||
EXPORT_SYMBOL(hchacha_block_arch);
|
||||
|
||||
/*
 * Initialise a ChaCha state from @key and @iv. There is no arch-specific
 * work to do here: this simply forwards to chacha_init_generic().
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
|
||||
EXPORT_SYMBOL(chacha_init_arch);
|
||||
|
||||
/*
 * XOR @bytes bytes of @src with the ChaCha keystream into @dst.
 *
 * The generic implementation handles the request when the have_neon static
 * key is off, when the input is at most one block long, or when SIMD may not
 * be used in the current context. Otherwise the data is processed with NEON
 * in pieces of at most 4 KiB, each inside its own
 * kernel_neon_begin()/kernel_neon_end() section.
 * NOTE(review): the 4 KiB cap presumably bounds the time spent inside one
 * NEON section - confirm.
 */
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
	    !may_use_simd()) {
		chacha_crypt_generic(state, dst, src, bytes, nrounds);
		return;
	}

	/* bytes > CHACHA_BLOCK_SIZE here, so at least one pass is needed. */
	for (;;) {
		unsigned int chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, chunk, nrounds);
		kernel_neon_end();

		bytes -= chunk;
		if (!bytes)
			break;

		src += chunk;
		dst += chunk;
	}
}
|
||||
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||
|
||||
static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||
struct chacha_ctx *ctx, u8 *iv)
|
||||
const struct chacha_ctx *ctx, const u8 *iv)
|
||||
{
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
@@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
crypto_chacha_init(state, ctx, iv);
|
||||
chacha_init_generic(state, ctx->key, iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
nbytes = rounddown(nbytes, walk.stride);
|
||||
|
||||
kernel_neon_begin();
|
||||
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes, ctx->nrounds);
|
||||
kernel_neon_end();
|
||||
if (!static_branch_likely(&have_neon) ||
|
||||
!may_use_simd()) {
|
||||
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes,
|
||||
ctx->nrounds);
|
||||
} else {
|
||||
kernel_neon_begin();
|
||||
chacha_doneon(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||
kernel_neon_end();
|
||||
}
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
@@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_chacha_crypt(req);
|
||||
|
||||
return chacha_neon_stream_xor(req, ctx, req->iv);
|
||||
}
|
||||
|
||||
@@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
|
||||
u32 state[16];
|
||||
u8 real_iv[16];
|
||||
|
||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_xchacha_crypt(req);
|
||||
|
||||
crypto_chacha_init(state, ctx, req->iv);
|
||||
|
||||
kernel_neon_begin();
|
||||
hchacha_block_neon(state, subctx.key, ctx->nrounds);
|
||||
kernel_neon_end();
|
||||
chacha_init_generic(state, ctx->key, req->iv);
|
||||
hchacha_block_arch(state, subctx.key, ctx->nrounds);
|
||||
subctx.nrounds = ctx->nrounds;
|
||||
|
||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||
@@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = chacha_neon,
|
||||
.decrypt = chacha_neon,
|
||||
}, {
|
||||
@@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = xchacha_neon,
|
||||
.decrypt = xchacha_neon,
|
||||
}, {
|
||||
@@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha12_setkey,
|
||||
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = xchacha_neon,
|
||||
.decrypt = xchacha_neon,
|
||||
}
|
||||
@@ -176,15 +213,19 @@ static struct skcipher_alg algs[] = {
|
||||
|
||||
static int __init chacha_simd_mod_init(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_NEON))
|
||||
return -ENODEV;
|
||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||
return 0;
|
||||
|
||||
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
|
||||
static_branch_enable(&have_neon);
|
||||
|
||||
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
|
||||
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
|
||||
}
|
||||
|
||||
static void __exit chacha_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
|
||||
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||
}
|
||||
|
||||
module_init(chacha_simd_mod_init);
|
||||
@@ -1,133 +0,0 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
||||
*
|
||||
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Based on:
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
||||
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
||||
|
||||
/*
 * XOR @bytes bytes of @src with the ChaCha20 keystream into @dst.
 *
 * Groups of four blocks are handled by the 4-block NEON routine, each call
 * in its own kernel_neon_begin()/kernel_neon_end() section. Whatever is
 * left (fewer than four blocks) is handled one block at a time inside a
 * single NEON section; a final partial block goes through a stack bounce
 * buffer. state[12] is advanced for every full block processed; it is not
 * advanced for the final partial block.
 */
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	const unsigned int quad = CHACHA_BLOCK_SIZE * 4;

	for (; bytes >= quad; bytes -= quad, src += quad, dst += quad) {
		kernel_neon_begin();
		chacha20_4block_xor_neon(state, dst, src);
		kernel_neon_end();
		state[12] += 4;
	}

	if (bytes == 0)
		return;

	kernel_neon_begin();
	for (; bytes >= CHACHA_BLOCK_SIZE; bytes -= CHACHA_BLOCK_SIZE) {
		chacha20_block_xor_neon(state, dst, src);
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes != 0) {
		u8 tail[CHACHA_BLOCK_SIZE];

		memcpy(tail, src, bytes);
		chacha20_block_xor_neon(state, tail, tail);
		memcpy(dst, tail, bytes);
	}
	kernel_neon_end();
}
|
||||
|
||||
static int chacha20_neon(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
int err;
|
||||
|
||||
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
|
||||
return crypto_chacha_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
crypto_chacha_init(state, ctx, walk.iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes);
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-neon",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_neon,
|
||||
.decrypt = chacha20_neon,
|
||||
};
|
||||
|
||||
/*
 * Module init: register the skcipher when the CPU advertises ASIMD in
 * elf_hwcap, otherwise fail with -ENODEV.
 */
static int __init chacha20_simd_mod_init(void)
{
	if (elf_hwcap & HWCAP_ASIMD)
		return crypto_register_skcipher(&alg);

	return -ENODEV;
}
|
||||
|
||||
/* Module exit: unregister the skcipher described by alg. */
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
/* Hook the init/exit functions above into module load and unload. */
module_init(chacha20_simd_mod_init);
|
||||
module_exit(chacha20_simd_mod_fini);
|
||||
|
||||
/* Module metadata; the crypto alias matches the cra_name used in alg. */
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
@@ -0,0 +1,913 @@
|
||||
#!/usr/bin/env perl
|
||||
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
|
||||
# project.
|
||||
# ====================================================================
|
||||
#
|
||||
# This module implements Poly1305 hash for ARMv8.
|
||||
#
|
||||
# June 2015
|
||||
#
|
||||
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||||
#
|
||||
# IALU/gcc-4.9 NEON
|
||||
#
|
||||
# Apple A7 1.86/+5% 0.72
|
||||
# Cortex-A53 2.69/+58% 1.47
|
||||
# Cortex-A57 2.70/+7% 1.14
|
||||
# Denver 1.64/+50% 1.18(*)
|
||||
# X-Gene 2.13/+68% 2.27
|
||||
# Mongoose 1.77/+75% 1.12
|
||||
# Kryo 2.70/+55% 1.13
|
||||
# ThunderX2 1.17/+95% 1.36
|
||||
#
|
||||
# (*) estimate based on resources availability is less than 1.0,
|
||||
# i.e. measured result is worse than expected, presumably binary
|
||||
# translator is not almighty;
|
||||
|
||||
# Command line: <flavour> <output file>.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then under ../../perlasm/,
    # and pipe everything printed on STDOUT through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly instead of silently generating nothing when the
    # translator cannot be started.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No flavour, or "void": write the generated text to $output unfiltered.
    # A failed open would otherwise discard all output without any error.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}
|
||||
|
||||
# The four function arguments, mapped to x0..x3.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
|
||||
# poly1305_emit reuses the 2nd and 3rd argument registers (x1, x2) as the
# output MAC pointer and the nonce pointer.
my ($mac,$nonce)=($inp,$len);
|
||||
|
||||
# Scalar working registers x4..x14: hash limbs (h0-h2), key limbs (r0,r1),
# s1 = r1 + (r1 >> 2), temporaries (t0,t1) and product limbs (d0-d2).
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
|
||||
|
||||
$code.=<<___;
|
||||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
// forward "declarations" are required for Apple
|
||||
.globl poly1305_blocks
|
||||
.globl poly1305_emit
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,%function
|
||||
.align 5
|
||||
poly1305_init:
|
||||
cmp $inp,xzr
|
||||
stp xzr,xzr,[$ctx] // zero hash value
|
||||
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
|
||||
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
#ifndef __KERNEL__
|
||||
adrp x17,OPENSSL_armcap_P
|
||||
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||
#endif
|
||||
|
||||
ldp $r0,$r1,[$inp] // load key
|
||||
mov $s1,#0xfffffffc0fffffff
|
||||
movk $s1,#0x0fff,lsl#48
|
||||
#ifdef __AARCH64EB__
|
||||
rev $r0,$r0 // flip bytes
|
||||
rev $r1,$r1
|
||||
#endif
|
||||
and $r0,$r0,$s1 // &=0ffffffc0fffffff
|
||||
and $s1,$s1,#-4
|
||||
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
|
||||
mov w#$s1,#-1
|
||||
stp $r0,$r1,[$ctx,#32] // save key value
|
||||
str w#$s1,[$ctx,#48] // impossible key power value
|
||||
|
||||
#ifndef __KERNEL__
|
||||
tst w17,#ARMV7_NEON
|
||||
|
||||
adr $d0,.Lpoly1305_blocks
|
||||
adr $r0,.Lpoly1305_blocks_neon
|
||||
adr $d1,.Lpoly1305_emit
|
||||
|
||||
csel $d0,$d0,$r0,eq
|
||||
|
||||
# ifdef __ILP32__
|
||||
stp w#$d0,w#$d1,[$len]
|
||||
# else
|
||||
stp $d0,$d1,[$len]
|
||||
# endif
|
||||
#endif
|
||||
mov x0,#1
|
||||
.Lno_key:
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
|
||||
.type poly1305_blocks,%function
|
||||
.align 5
|
||||
poly1305_blocks:
|
||||
.Lpoly1305_blocks:
|
||||
ands $len,$len,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
ldp $h0,$h1,[$ctx] // load hash value
|
||||
ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr $d0,$h0,#32
|
||||
mov w#$d1,w#$h0
|
||||
lsr $d2,$h1,#32
|
||||
mov w15,w#$h1
|
||||
lsr x16,$h2,#32
|
||||
#else
|
||||
mov w#$d0,w#$h0
|
||||
lsr $d1,$h0,#32
|
||||
mov w#$d2,w#$h1
|
||||
lsr x15,$h1,#32
|
||||
mov w16,w#$h2
|
||||
#endif
|
||||
|
||||
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $d1,$d2,#12
|
||||
adds $d0,$d0,$d2,lsl#52
|
||||
add $d1,$d1,x15,lsl#14
|
||||
adc $d1,$d1,xzr
|
||||
lsr $d2,x16,#24
|
||||
adds $d1,$d1,x16,lsl#40
|
||||
adc $d2,$d2,xzr
|
||||
|
||||
cmp x17,#0 // is_base2_26?
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
csel $h0,$h0,$d0,eq // choose between radixes
|
||||
csel $h1,$h1,$d1,eq
|
||||
csel $h2,$h2,$d2,eq
|
||||
|
||||
.Loop:
|
||||
ldp $t0,$t1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
#ifdef __AARCH64EB__
|
||||
rev $t0,$t0
|
||||
rev $t1,$t1
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate input
|
||||
adcs $h1,$h1,$t1
|
||||
|
||||
mul $d0,$h0,$r0 // h0*r0
|
||||
adc $h2,$h2,$padbit
|
||||
umulh $d1,$h0,$r0
|
||||
|
||||
mul $t0,$h1,$s1 // h1*5*r1
|
||||
umulh $t1,$h1,$s1
|
||||
|
||||
adds $d0,$d0,$t0
|
||||
mul $t0,$h0,$r1 // h0*r1
|
||||
adc $d1,$d1,$t1
|
||||
umulh $d2,$h0,$r1
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h1,$r0 // h1*r0
|
||||
adc $d2,$d2,xzr
|
||||
umulh $t1,$h1,$r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h2,$s1 // h2*5*r1
|
||||
adc $d2,$d2,$t1
|
||||
mul $t1,$h2,$r0 // h2*r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
adc $d2,$d2,$t1
|
||||
|
||||
and $t0,$d2,#-4 // final reduction
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
cbnz $len,.Loop
|
||||
|
||||
stp $h0,$h1,[$ctx] // store hash value
|
||||
stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
|
||||
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
.Lpoly1305_emit:
|
||||
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
|
||||
ldp $t0,$t1,[$nonce] // load nonce
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr $d0,$h0,#32
|
||||
mov w#$d1,w#$h0
|
||||
lsr $d2,$h1,#32
|
||||
mov w15,w#$h1
|
||||
lsr x16,$h2,#32
|
||||
#else
|
||||
mov w#$d0,w#$h0
|
||||
lsr $d1,$h0,#32
|
||||
mov w#$d2,w#$h1
|
||||
lsr x15,$h1,#32
|
||||
mov w16,w#$h2
|
||||
#endif
|
||||
|
||||
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $d1,$d2,#12
|
||||
adds $d0,$d0,$d2,lsl#52
|
||||
add $d1,$d1,x15,lsl#14
|
||||
adc $d1,$d1,xzr
|
||||
lsr $d2,x16,#24
|
||||
adds $d1,$d1,x16,lsl#40
|
||||
adc $d2,$d2,xzr
|
||||
|
||||
cmp $r0,#0 // is_base2_26?
|
||||
csel $h0,$h0,$d0,eq // choose between radixes
|
||||
csel $h1,$h1,$d1,eq
|
||||
csel $h2,$h2,$d2,eq
|
||||
|
||||
adds $d0,$h0,#5 // compare to modulus
|
||||
adcs $d1,$h1,xzr
|
||||
adc $d2,$h2,xzr
|
||||
|
||||
tst $d2,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel $h0,$h0,$d0,eq
|
||||
csel $h1,$h1,$d1,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror $t0,$t0,#32 // flip nonce words
|
||||
ror $t1,$t1,#32
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate nonce
|
||||
adc $h1,$h1,$t1
|
||||
#ifdef __AARCH64EB__
|
||||
rev $h0,$h0 // flip output bytes
|
||||
rev $h1,$h1
|
||||
#endif
|
||||
stp $h0,$h1,[$mac] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
___
|
||||
# v0..v8: the r^n table (R0-R4 and S1-S4), loaded with ld1 from $ctx+48.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
||||
# v9..v13: base 2^26 limbs of the inp[0:1] input pair.
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
||||
# v14..v18: base 2^26 limbs of the inp[2:3] input pair.
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
||||
# v19..v23: 64-bit accumulators d0..d4 of the multiplication.
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
||||
# v24..v28: hash value limbs h0..h4 in base 2^26.
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
||||
# v29..v31: two temporaries and the limb mask (built with movi/ushr #38).
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
||||
|
||||
# x16: pointer used to load inp[2:3]; x17: address of .Lzeros.
my ($in2,$zeros)=("x16","x17");
|
||||
# The is_base2_26 flag is read into x17 before x17 is needed for .Lzeros.
my $is_base2_26 = $zeros; # borrow
|
||||
|
||||
$code.=<<___;
|
||||
.type poly1305_mult,%function
|
||||
.align 5
|
||||
poly1305_mult:
|
||||
mul $d0,$h0,$r0 // h0*r0
|
||||
umulh $d1,$h0,$r0
|
||||
|
||||
mul $t0,$h1,$s1 // h1*5*r1
|
||||
umulh $t1,$h1,$s1
|
||||
|
||||
adds $d0,$d0,$t0
|
||||
mul $t0,$h0,$r1 // h0*r1
|
||||
adc $d1,$d1,$t1
|
||||
umulh $d2,$h0,$r1
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h1,$r0 // h1*r0
|
||||
adc $d2,$d2,xzr
|
||||
umulh $t1,$h1,$r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h2,$s1 // h2*5*r1
|
||||
adc $d2,$d2,$t1
|
||||
mul $t1,$h2,$r0 // h2*r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
adc $d2,$d2,$t1
|
||||
|
||||
and $t0,$d2,#-4 // final reduction
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
ret
|
||||
.size poly1305_mult,.-poly1305_mult
|
||||
|
||||
.type poly1305_splat,%function
|
||||
.align 4
|
||||
poly1305_splat:
|
||||
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,$h0,#26,#26
|
||||
extr x14,$h1,$h0,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,$h1,#14,#26
|
||||
extr x16,$h2,$h1,#40
|
||||
|
||||
str w12,[$ctx,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[$ctx,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[$ctx,#16*2] // s1
|
||||
str w14,[$ctx,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[$ctx,#16*4] // s2
|
||||
str w15,[$ctx,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[$ctx,#16*6] // s3
|
||||
str w16,[$ctx,#16*7] // r4
|
||||
str w15,[$ctx,#16*8] // s4
|
||||
|
||||
ret
|
||||
.size poly1305_splat,.-poly1305_splat
|
||||
|
||||
#ifdef __KERNEL__
|
||||
.globl poly1305_blocks_neon
|
||||
#endif
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
.Lpoly1305_blocks_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cmp $len,#128
|
||||
b.lo .Lpoly1305_blocks
|
||||
|
||||
.inst 0xd503233f // paciasp
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
cbz $is_base2_26,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||
ldp w12,w13,[$ctx,#8]
|
||||
ldr w14,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $h1,x12,#12
|
||||
adds $h0,$h0,x12,lsl#52
|
||||
add $h1,$h1,x13,lsl#14
|
||||
adc $h1,$h1,xzr
|
||||
lsr $h2,x14,#24
|
||||
adds $h1,$h1,x14,lsl#40
|
||||
adc $d2,$h2,xzr // can be partially reduced...
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
b .Leven_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
||||
ldr $h2,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
ldr w17,[$ctx,#48] // first table element
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
cmp w17,#-1 // is value impossible?
|
||||
b.ne .Leven_neon
|
||||
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov $h0,$r0 // r^1
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov $h1,$r1
|
||||
mov $h2,xzr
|
||||
add $ctx,$ctx,#48+12
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^2
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^3
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^4
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
sub $ctx,$ctx,#48 // restore original $ctx
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[$inp,#32] // inp[2:3]
|
||||
subs $len,$len,#64
|
||||
ldp x9,x13,[$inp,#48]
|
||||
add $in2,$inp,#96
|
||||
adr $zeros,.Lzeros
|
||||
|
||||
lsl $padbit,$padbit,#24
|
||||
add x15,$ctx,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN23_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN23_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov $IN23_2,x8
|
||||
fmov $IN23_3,x10
|
||||
fmov $IN23_4,x12
|
||||
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
ldp x9,x13,[$inp],#48
|
||||
|
||||
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
||||
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
||||
ld1 {$S4},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN01_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN01_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi $MASK.2d,#-1
|
||||
fmov $IN01_2,x8
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
ushr $MASK.2d,$MASK.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// \___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// \___________________/ \____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs $len,$len,#64
|
||||
umull $ACC4,$IN23_0,${R4}[2]
|
||||
csel $in2,$zeros,$in2,lo
|
||||
umull $ACC3,$IN23_0,${R3}[2]
|
||||
umull $ACC2,$IN23_0,${R2}[2]
|
||||
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||
umull $ACC1,$IN23_0,${R1}[2]
|
||||
ldp x9,x13,[$in2],#48
|
||||
umull $ACC0,$IN23_0,${R0}[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal $ACC4,$IN23_1,${R3}[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC3,$IN23_1,${R2}[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC2,$IN23_1,${R1}[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN23_1,${R0}[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal $ACC0,$IN23_1,${S4}[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal $ACC4,$IN23_2,${R2}[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC3,$IN23_2,${R1}[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC2,$IN23_2,${R0}[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC1,$IN23_2,${S4}[2]
|
||||
fmov $IN23_0,x4
|
||||
umlal $ACC0,$IN23_2,${S3}[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal $ACC4,$IN23_3,${R1}[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN23_3,${R0}[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC2,$IN23_3,${S4}[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC1,$IN23_3,${S3}[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC0,$IN23_3,${S2}[2]
|
||||
fmov $IN23_1,x6
|
||||
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
add x12,$padbit,x12,lsr#40
|
||||
umlal $ACC4,$IN23_4,${R0}[2]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC3,$IN23_4,${S4}[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC2,$IN23_4,${S3}[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN23_4,${S2}[2]
|
||||
fmov $IN23_2,x8
|
||||
umlal $ACC0,$IN23_4,${S1}[2]
|
||||
fmov $IN23_3,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
fmov $IN23_4,x12
|
||||
umlal $ACC3,$IN01_2,${R1}[0]
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
umlal $ACC0,$IN01_2,${S3}[0]
|
||||
ldp x9,x13,[$inp],#48
|
||||
umlal $ACC4,$IN01_2,${R2}[0]
|
||||
umlal $ACC1,$IN01_2,${S4}[0]
|
||||
umlal $ACC2,$IN01_2,${R0}[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}[0]
|
||||
umlal $ACC4,$IN01_0,${R4}[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC2,$IN01_0,${R2}[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC0,$IN01_0,${R0}[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN01_0,${R1}[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal $ACC3,$IN01_1,${R2}[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC4,$IN01_1,${R3}[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC0,$IN01_1,${S4}[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC2,$IN01_1,${R1}[0]
|
||||
fmov $IN01_0,x4
|
||||
umlal $ACC1,$IN01_1,${R0}[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN01_3,${R0}[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC0,$IN01_3,${S2}[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC4,$IN01_3,${R1}[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC1,$IN01_3,${S3}[0]
|
||||
fmov $IN01_1,x6
|
||||
umlal $ACC2,$IN01_3,${S4}[0]
|
||||
add x12,$padbit,x12,lsr#40
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}[0]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC0,$IN01_4,${S1}[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC4,$IN01_4,${R0}[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN01_4,${S2}[0]
|
||||
fmov $IN01_2,x8
|
||||
umlal $ACC2,$IN01_4,${S3}[0]
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
xtn $H3,$ACC3
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
xtn $H4,$ACC4
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
xtn $H1,$ACC1
|
||||
bic $H4,#0xfc,lsl#24
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
shrn $T1.2s,$ACC2,#26
|
||||
xtn $H2,$ACC2
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
bic $H1,#0xfc,lsl#24
|
||||
add $H3,$H3,$T1.2s // h2 -> h3
|
||||
bic $H2,#0xfc,lsl#24
|
||||
|
||||
shrn $T0.2s,$ACC0,#26
|
||||
xtn $H0,$ACC0
|
||||
ushr $T1.2s,$H3,#26
|
||||
bic $H3,#0xfc,lsl#24
|
||||
bic $H0,#0xfc,lsl#24
|
||||
add $H1,$H1,$T0.2s // h0 -> h1
|
||||
add $H4,$H4,$T1.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup $IN23_2,${IN23_2}[0]
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds $len,$len,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup $IN23_2,${IN01_2}[0]
|
||||
add $IN23_0,$IN01_0,$H0
|
||||
add $IN23_3,$IN01_3,$H3
|
||||
add $IN23_1,$IN01_1,$H1
|
||||
add $IN23_4,$IN01_4,$H4
|
||||
|
||||
.Long_tail:
|
||||
dup $IN23_0,${IN23_0}[0]
|
||||
umull2 $ACC0,$IN23_2,${S3}
|
||||
umull2 $ACC3,$IN23_2,${R1}
|
||||
umull2 $ACC4,$IN23_2,${R2}
|
||||
umull2 $ACC2,$IN23_2,${R0}
|
||||
umull2 $ACC1,$IN23_2,${S4}
|
||||
|
||||
dup $IN23_1,${IN23_1}[0]
|
||||
umlal2 $ACC0,$IN23_0,${R0}
|
||||
umlal2 $ACC2,$IN23_0,${R2}
|
||||
umlal2 $ACC3,$IN23_0,${R3}
|
||||
umlal2 $ACC4,$IN23_0,${R4}
|
||||
umlal2 $ACC1,$IN23_0,${R1}
|
||||
|
||||
dup $IN23_3,${IN23_3}[0]
|
||||
umlal2 $ACC0,$IN23_1,${S4}
|
||||
umlal2 $ACC3,$IN23_1,${R2}
|
||||
umlal2 $ACC2,$IN23_1,${R1}
|
||||
umlal2 $ACC4,$IN23_1,${R3}
|
||||
umlal2 $ACC1,$IN23_1,${R0}
|
||||
|
||||
dup $IN23_4,${IN23_4}[0]
|
||||
umlal2 $ACC3,$IN23_3,${R0}
|
||||
umlal2 $ACC4,$IN23_3,${R1}
|
||||
umlal2 $ACC0,$IN23_3,${S2}
|
||||
umlal2 $ACC1,$IN23_3,${S3}
|
||||
umlal2 $ACC2,$IN23_3,${S4}
|
||||
|
||||
umlal2 $ACC3,$IN23_4,${S4}
|
||||
umlal2 $ACC0,$IN23_4,${S1}
|
||||
umlal2 $ACC4,$IN23_4,${R0}
|
||||
umlal2 $ACC1,$IN23_4,${S2}
|
||||
umlal2 $ACC2,$IN23_4,${S3}
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
umlal $ACC3,$IN01_2,${R1}
|
||||
umlal $ACC0,$IN01_2,${S3}
|
||||
umlal $ACC4,$IN01_2,${R2}
|
||||
umlal $ACC1,$IN01_2,${S4}
|
||||
umlal $ACC2,$IN01_2,${R0}
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}
|
||||
umlal $ACC0,$IN01_0,${R0}
|
||||
umlal $ACC4,$IN01_0,${R4}
|
||||
umlal $ACC1,$IN01_0,${R1}
|
||||
umlal $ACC2,$IN01_0,${R2}
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
umlal $ACC3,$IN01_1,${R2}
|
||||
umlal $ACC0,$IN01_1,${S4}
|
||||
umlal $ACC4,$IN01_1,${R3}
|
||||
umlal $ACC1,$IN01_1,${R0}
|
||||
umlal $ACC2,$IN01_1,${R1}
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
umlal $ACC3,$IN01_3,${R0}
|
||||
umlal $ACC0,$IN01_3,${S2}
|
||||
umlal $ACC4,$IN01_3,${R1}
|
||||
umlal $ACC1,$IN01_3,${S3}
|
||||
umlal $ACC2,$IN01_3,${S4}
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}
|
||||
umlal $ACC0,$IN01_4,${S1}
|
||||
umlal $ACC4,$IN01_4,${R0}
|
||||
umlal $ACC1,$IN01_4,${S2}
|
||||
umlal $ACC2,$IN01_4,${S3}
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp $ACC3,$ACC3,$ACC3
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp $ACC0,$ACC0,$ACC0
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp $ACC4,$ACC4,$ACC4
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp $ACC1,$ACC1,$ACC1
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp $ACC2,$ACC2,$ACC2
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
and $ACC4,$ACC4,$MASK.2d
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
and $ACC1,$ACC1,$MASK.2d
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
ushr $T1.2d,$ACC2,#26
|
||||
and $ACC2,$ACC2,$MASK.2d
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
||||
|
||||
ushr $T0.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
ushr $T1.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
||||
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
||||
mov x4,#1
|
||||
st1 {$ACC4}[0],[$ctx]
|
||||
str x4,[$ctx,#8] // set is_base2_26
|
||||
|
||||
ldr x29,[sp],#80
|
||||
.inst 0xd50323bf // autiasp
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
|
||||
.align 2
|
||||
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
||||
#endif
|
||||
___
|
||||
|
||||
# Post-process the accumulated assembly text line by line, rewriting the
# vector operand arrangements that the register-name variables above cannot
# express on their own, then print each line.
foreach my $line (split("\n",$code)) {
    # Exactly one of these mnemonic-specific fix-ups is attempted per line.
    # The trailing "or 1" ends the chain as soon as the mnemonic matched,
    # even when no operand needed rewriting.
    $line =~ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/				or
    $line =~ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
    ($line =~ m/\bdup\b/ and ($line =~ s/\.[24]s/.2d/g or 1))		or
    ($line =~ m/\b(eor|and)/ and ($line =~ s/\.[248][sdh]/.16b/g or 1))	or
    ($line =~ m/\bum(ul|la)l\b/ and ($line =~ s/\.4s/.2s/g or 1))	or
    ($line =~ m/\bum(ul|la)l2\b/ and ($line =~ s/\.2s/.4s/g or 1))	or
    ($line =~ m/\bst[1-4]\s+{[^}]+}\[/ and ($line =~ s/\.[24]d/.s/g or 1));

    # Lane-indexed operand: drop the element count, e.g. ".4s[" -> ".s[".
    $line =~ s/\.[124]([sd])\[/.$1\[/;
    # "w#xN" is the template's spelling for the 32-bit view of xN.
    $line =~ s/w#x([0-9]+)/w$1/g;

    print $line,"\n";
}
|
||||
# Flush and close the output.  For the piped (arm-xlate.pl) case close also
# reports a failing child, so do not ignore its result: a truncated or
# missing output file must fail the build step rather than pass silently.
close STDOUT or die "error closing STDOUT: $!";
|
||||
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
@@ -0,0 +1,835 @@
|
||||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
// forward "declarations" are required for Apple
|
||||
.globl poly1305_blocks
|
||||
.globl poly1305_emit
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,%function
|
||||
.align 5
|
||||
poly1305_init:
|
||||
cmp x1,xzr
|
||||
stp xzr,xzr,[x0] // zero hash value
|
||||
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
|
||||
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
#ifndef __KERNEL__
|
||||
adrp x17,OPENSSL_armcap_P
|
||||
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||
#endif
|
||||
|
||||
ldp x7,x8,[x1] // load key
|
||||
mov x9,#0xfffffffc0fffffff
|
||||
movk x9,#0x0fff,lsl#48
|
||||
#ifdef __AARCH64EB__
|
||||
rev x7,x7 // flip bytes
|
||||
rev x8,x8
|
||||
#endif
|
||||
and x7,x7,x9 // &=0ffffffc0fffffff
|
||||
and x9,x9,#-4
|
||||
and x8,x8,x9 // &=0ffffffc0ffffffc
|
||||
mov w9,#-1
|
||||
stp x7,x8,[x0,#32] // save key value
|
||||
str w9,[x0,#48] // impossible key power value
|
||||
|
||||
#ifndef __KERNEL__
|
||||
tst w17,#ARMV7_NEON
|
||||
|
||||
adr x12,.Lpoly1305_blocks
|
||||
adr x7,.Lpoly1305_blocks_neon
|
||||
adr x13,.Lpoly1305_emit
|
||||
|
||||
csel x12,x12,x7,eq
|
||||
|
||||
# ifdef __ILP32__
|
||||
stp w12,w13,[x2]
|
||||
# else
|
||||
stp x12,x13,[x2]
|
||||
# endif
|
||||
#endif
|
||||
mov x0,#1
|
||||
.Lno_key:
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
|
||||
.type poly1305_blocks,%function
|
||||
.align 5
|
||||
poly1305_blocks:
|
||||
.Lpoly1305_blocks:
|
||||
ands x2,x2,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
ldp x4,x5,[x0] // load hash value
|
||||
ldp x6,x17,[x0,#16] // [along with is_base2_26]
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr x12,x4,#32
|
||||
mov w13,w4
|
||||
lsr x14,x5,#32
|
||||
mov w15,w5
|
||||
lsr x16,x6,#32
|
||||
#else
|
||||
mov w12,w4
|
||||
lsr x13,x4,#32
|
||||
mov w14,w5
|
||||
lsr x15,x5,#32
|
||||
mov w16,w6
|
||||
#endif
|
||||
|
||||
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x13,x14,#12
|
||||
adds x12,x12,x14,lsl#52
|
||||
add x13,x13,x15,lsl#14
|
||||
adc x13,x13,xzr
|
||||
lsr x14,x16,#24
|
||||
adds x13,x13,x16,lsl#40
|
||||
adc x14,x14,xzr
|
||||
|
||||
cmp x17,#0 // is_base2_26?
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
csel x4,x4,x12,eq // choose between radixes
|
||||
csel x5,x5,x13,eq
|
||||
csel x6,x6,x14,eq
|
||||
|
||||
.Loop:
|
||||
ldp x10,x11,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
#ifdef __AARCH64EB__
|
||||
rev x10,x10
|
||||
rev x11,x11
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate input
|
||||
adcs x5,x5,x11
|
||||
|
||||
mul x12,x4,x7 // h0*r0
|
||||
adc x6,x6,x3
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
cbnz x2,.Loop
|
||||
|
||||
stp x4,x5,[x0] // store hash value
|
||||
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
|
||||
// poly1305_emit: finish the computation and write the 16-byte result.
//
// In:  x0 = state: hash words at [x0] and [x0,#16], is_base2_26 flag at
//           [x0,#24] (loaded into x7)
//      x1 = output buffer, 16 bytes are stored
//      x2 = nonce, 16 bytes are loaded
//
// Steps, in order:
//   1. A base 2^26 -> base 2^64 conversion of the loaded words is always
//      computed into x12-x14; the csel on x7 keeps it only when the
//      is_base2_26 flag is non-zero, otherwise the loaded x4-x6 are used.
//   2. hash+5 is computed; if that sum has any bit set above bit 1 of its
//      top word, the low two words of the sum replace the hash.
//   3. The nonce is added to the low two words and the result is stored.
// Big-endian builds additionally swap nonce words and output bytes.
// Overwrites x4-x7 and x10-x16.
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
.Lpoly1305_emit:
|
||||
ldp x4,x5,[x0] // load hash base 2^64
|
||||
ldp x6,x7,[x0,#16] // [along with is_base2_26]
|
||||
ldp x10,x11,[x2] // load nonce
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr x12,x4,#32
|
||||
mov w13,w4
|
||||
lsr x14,x5,#32
|
||||
mov w15,w5
|
||||
lsr x16,x6,#32
|
||||
#else
|
||||
mov w12,w4
|
||||
lsr x13,x4,#32
|
||||
mov w14,w5
|
||||
lsr x15,x5,#32
|
||||
mov w16,w6
|
||||
#endif
|
||||
|
||||
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x13,x14,#12
|
||||
adds x12,x12,x14,lsl#52
|
||||
add x13,x13,x15,lsl#14
|
||||
adc x13,x13,xzr
|
||||
lsr x14,x16,#24
|
||||
adds x13,x13,x16,lsl#40
|
||||
adc x14,x14,xzr
|
||||
|
||||
cmp x7,#0 // is_base2_26?
|
||||
csel x4,x4,x12,eq // choose between radixes
|
||||
csel x5,x5,x13,eq
|
||||
csel x6,x6,x14,eq
|
||||
|
||||
adds x12,x4,#5 // compare to modulus
|
||||
adcs x13,x5,xzr
|
||||
adc x14,x6,xzr
|
||||
|
||||
tst x14,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel x4,x4,x12,eq
|
||||
csel x5,x5,x13,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror x10,x10,#32 // flip nonce words
|
||||
ror x11,x11,#32
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate nonce
|
||||
adc x5,x5,x11
|
||||
#ifdef __AARCH64EB__
|
||||
rev x4,x4 // flip output bytes
|
||||
rev x5,x5
|
||||
#endif
|
||||
stp x4,x5,[x1] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
// poly1305_mult: multiply the three-word value h by the two-word key r and
// fold the result back to three words.
//
// In:  x4,x5,x6 = h0,h1,h2
//      x7 = r0, x8 = r1
//      x9 = multiplier used for the "5*r1" products; the callers in this
//           file set it to r1 + (r1 >> 2) before calling
// Out: x4,x5,x6 = new h0,h1,h2 after the final reduction, which adds
//      (top & ~3) + (top >> 2) back into the low words and keeps top & 3
// Overwrites x10-x14. x7-x9 are only read, so the routine can be called
// repeatedly with the same key.
.type poly1305_mult,%function
|
||||
.align 5
|
||||
poly1305_mult:
|
||||
mul x12,x4,x7 // h0*r0
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
ret
|
||||
.size poly1305_mult,.-poly1305_mult
|
||||
|
||||
// poly1305_splat: split the value in x4,x5,x6 into five words
// (base 2^64 -> base 2^26) and store them, together with 5x multiples of
// words 1..4, as 32-bit values at a 16-byte stride starting at x0.
//
// The four low words are masked/extracted to 26 bits; the top word is
// (x6:x5) >> 40 and is not masked.
// Stored order at x0 + 16*0 .. x0 + 16*8: r0, r1, s1, r2, s2, r3, s3, r4, s4
// where sN = rN*5 (computed as rN + (rN << 2)).
//
// Overwrites x12-x16. x0 and x4-x6 are only read.
.type poly1305_splat,%function
|
||||
.align 4
|
||||
poly1305_splat:
|
||||
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,x4,#26,#26
|
||||
extr x14,x5,x4,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,x5,#14,#26
|
||||
extr x16,x6,x5,#40
|
||||
|
||||
str w12,[x0,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[x0,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[x0,#16*2] // s1
|
||||
str w14,[x0,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[x0,#16*4] // s2
|
||||
str w15,[x0,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[x0,#16*6] // s3
|
||||
str w16,[x0,#16*7] // r4
|
||||
str w15,[x0,#16*8] // s4
|
||||
|
||||
ret
|
||||
.size poly1305_splat,.-poly1305_splat
|
||||
|
||||
#ifdef __KERNEL__
|
||||
.globl poly1305_blocks_neon
|
||||
#endif
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
.Lpoly1305_blocks_neon:
|
||||
ldr x17,[x0,#24]
|
||||
cmp x2,#128
|
||||
b.lo .Lpoly1305_blocks
|
||||
|
||||
.inst 0xd503233f // paciasp
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
cbz x17,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[x0] // load hash value base 2^26
|
||||
ldp w12,w13,[x0,#8]
|
||||
ldr w14,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x5,x12,#12
|
||||
adds x4,x4,x12,lsl#52
|
||||
add x5,x5,x13,lsl#14
|
||||
adc x5,x5,xzr
|
||||
lsr x6,x14,#24
|
||||
adds x5,x5,x14,lsl#40
|
||||
adc x14,x6,xzr // can be partially reduced...
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
b .Leven_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
ldp x4,x5,[x0] // load hash value base 2^64
|
||||
ldr x6,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
ldr w17,[x0,#48] // first table element
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
cmp w17,#-1 // is value impossible?
|
||||
b.ne .Leven_neon
|
||||
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov x4,x7 // r^1
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov x5,x8
|
||||
mov x6,xzr
|
||||
add x0,x0,#48+12
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^2
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^3
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^4
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
sub x0,x0,#48 // restore original x0
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[x1,#32] // inp[2:3]
|
||||
subs x2,x2,#64
|
||||
ldp x9,x13,[x1,#48]
|
||||
add x16,x1,#96
|
||||
adr x17,.Lzeros
|
||||
|
||||
lsl x3,x3,#24
|
||||
add x15,x0,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d14,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d15,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov d16,x8
|
||||
fmov d17,x10
|
||||
fmov d18,x12
|
||||
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
ldp x9,x13,[x1],#48
|
||||
|
||||
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
|
||||
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
|
||||
ld1 {v8.4s},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d9,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d10,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi v31.2d,#-1
|
||||
fmov d11,x8
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
ushr v31.2d,v31.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// ___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// ___________________/ ____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs x2,x2,#64
|
||||
umull v23.2d,v14.2s,v7.s[2]
|
||||
csel x16,x17,x16,lo
|
||||
umull v22.2d,v14.2s,v5.s[2]
|
||||
umull v21.2d,v14.2s,v3.s[2]
|
||||
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||
umull v20.2d,v14.2s,v1.s[2]
|
||||
ldp x9,x13,[x16],#48
|
||||
umull v19.2d,v14.2s,v0.s[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal v23.2d,v15.2s,v5.s[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v22.2d,v15.2s,v3.s[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v21.2d,v15.2s,v1.s[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v15.2s,v0.s[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal v19.2d,v15.2s,v8.s[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal v23.2d,v16.2s,v3.s[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v22.2d,v16.2s,v1.s[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v21.2d,v16.2s,v0.s[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v20.2d,v16.2s,v8.s[2]
|
||||
fmov d14,x4
|
||||
umlal v19.2d,v16.2s,v6.s[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal v23.2d,v17.2s,v1.s[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v17.2s,v0.s[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v21.2d,v17.2s,v8.s[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v20.2d,v17.2s,v6.s[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v19.2d,v17.2s,v4.s[2]
|
||||
fmov d15,x6
|
||||
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
add x12,x3,x12,lsr#40
|
||||
umlal v23.2d,v18.2s,v0.s[2]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v22.2d,v18.2s,v8.s[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v21.2d,v18.2s,v6.s[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v18.2s,v4.s[2]
|
||||
fmov d16,x8
|
||||
umlal v19.2d,v18.2s,v2.s[2]
|
||||
fmov d17,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
fmov d18,x12
|
||||
umlal v22.2d,v11.2s,v1.s[0]
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
umlal v19.2d,v11.2s,v6.s[0]
|
||||
ldp x9,x13,[x1],#48
|
||||
umlal v23.2d,v11.2s,v3.s[0]
|
||||
umlal v20.2d,v11.2s,v8.s[0]
|
||||
umlal v21.2d,v11.2s,v0.s[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.s[0]
|
||||
umlal v23.2d,v9.2s,v7.s[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v21.2d,v9.2s,v3.s[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v19.2d,v9.2s,v0.s[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v9.2s,v1.s[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal v22.2d,v10.2s,v3.s[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v23.2d,v10.2s,v5.s[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v19.2d,v10.2s,v8.s[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v21.2d,v10.2s,v1.s[0]
|
||||
fmov d9,x4
|
||||
umlal v20.2d,v10.2s,v0.s[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v12.2s,v0.s[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v19.2d,v12.2s,v4.s[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v23.2d,v12.2s,v1.s[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v20.2d,v12.2s,v6.s[0]
|
||||
fmov d10,x6
|
||||
umlal v21.2d,v12.2s,v8.s[0]
|
||||
add x12,x3,x12,lsr#40
|
||||
|
||||
umlal v22.2d,v13.2s,v8.s[0]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v19.2d,v13.2s,v2.s[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v23.2d,v13.2s,v0.s[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v13.2s,v4.s[0]
|
||||
fmov d11,x8
|
||||
umlal v21.2d,v13.2s,v6.s[0]
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
xtn v27.2s,v22.2d
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
xtn v28.2s,v23.2d
|
||||
ushr v30.2d,v20.2d,#26
|
||||
xtn v25.2s,v20.2d
|
||||
bic v28.2s,#0xfc,lsl#24
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
shrn v30.2s,v21.2d,#26
|
||||
xtn v26.2s,v21.2d
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
bic v25.2s,#0xfc,lsl#24
|
||||
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||
bic v26.2s,#0xfc,lsl#24
|
||||
|
||||
shrn v29.2s,v19.2d,#26
|
||||
xtn v24.2s,v19.2d
|
||||
ushr v30.2s,v27.2s,#26
|
||||
bic v27.2s,#0xfc,lsl#24
|
||||
bic v24.2s,#0xfc,lsl#24
|
||||
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup v16.2d,v16.d[0]
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds x2,x2,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup v16.2d,v11.d[0]
|
||||
add v14.2s,v9.2s,v24.2s
|
||||
add v17.2s,v12.2s,v27.2s
|
||||
add v15.2s,v10.2s,v25.2s
|
||||
add v18.2s,v13.2s,v28.2s
|
||||
|
||||
.Long_tail:
|
||||
dup v14.2d,v14.d[0]
|
||||
umull2 v19.2d,v16.4s,v6.4s
|
||||
umull2 v22.2d,v16.4s,v1.4s
|
||||
umull2 v23.2d,v16.4s,v3.4s
|
||||
umull2 v21.2d,v16.4s,v0.4s
|
||||
umull2 v20.2d,v16.4s,v8.4s
|
||||
|
||||
dup v15.2d,v15.d[0]
|
||||
umlal2 v19.2d,v14.4s,v0.4s
|
||||
umlal2 v21.2d,v14.4s,v3.4s
|
||||
umlal2 v22.2d,v14.4s,v5.4s
|
||||
umlal2 v23.2d,v14.4s,v7.4s
|
||||
umlal2 v20.2d,v14.4s,v1.4s
|
||||
|
||||
dup v17.2d,v17.d[0]
|
||||
umlal2 v19.2d,v15.4s,v8.4s
|
||||
umlal2 v22.2d,v15.4s,v3.4s
|
||||
umlal2 v21.2d,v15.4s,v1.4s
|
||||
umlal2 v23.2d,v15.4s,v5.4s
|
||||
umlal2 v20.2d,v15.4s,v0.4s
|
||||
|
||||
dup v18.2d,v18.d[0]
|
||||
umlal2 v22.2d,v17.4s,v0.4s
|
||||
umlal2 v23.2d,v17.4s,v1.4s
|
||||
umlal2 v19.2d,v17.4s,v4.4s
|
||||
umlal2 v20.2d,v17.4s,v6.4s
|
||||
umlal2 v21.2d,v17.4s,v8.4s
|
||||
|
||||
umlal2 v22.2d,v18.4s,v8.4s
|
||||
umlal2 v19.2d,v18.4s,v2.4s
|
||||
umlal2 v23.2d,v18.4s,v0.4s
|
||||
umlal2 v20.2d,v18.4s,v4.4s
|
||||
umlal2 v21.2d,v18.4s,v6.4s
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
umlal v22.2d,v11.2s,v1.2s
|
||||
umlal v19.2d,v11.2s,v6.2s
|
||||
umlal v23.2d,v11.2s,v3.2s
|
||||
umlal v20.2d,v11.2s,v8.2s
|
||||
umlal v21.2d,v11.2s,v0.2s
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.2s
|
||||
umlal v19.2d,v9.2s,v0.2s
|
||||
umlal v23.2d,v9.2s,v7.2s
|
||||
umlal v20.2d,v9.2s,v1.2s
|
||||
umlal v21.2d,v9.2s,v3.2s
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
umlal v22.2d,v10.2s,v3.2s
|
||||
umlal v19.2d,v10.2s,v8.2s
|
||||
umlal v23.2d,v10.2s,v5.2s
|
||||
umlal v20.2d,v10.2s,v0.2s
|
||||
umlal v21.2d,v10.2s,v1.2s
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
umlal v22.2d,v12.2s,v0.2s
|
||||
umlal v19.2d,v12.2s,v4.2s
|
||||
umlal v23.2d,v12.2s,v1.2s
|
||||
umlal v20.2d,v12.2s,v6.2s
|
||||
umlal v21.2d,v12.2s,v8.2s
|
||||
|
||||
umlal v22.2d,v13.2s,v8.2s
|
||||
umlal v19.2d,v13.2s,v2.2s
|
||||
umlal v23.2d,v13.2s,v0.2s
|
||||
umlal v20.2d,v13.2s,v4.2s
|
||||
umlal v21.2d,v13.2s,v6.2s
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp v22.2d,v22.2d,v22.2d
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp v19.2d,v19.2d,v19.2d
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp v23.2d,v23.2d,v23.2d
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp v20.2d,v20.2d,v20.2d
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp v21.2d,v21.2d,v21.2d
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
and v23.16b,v23.16b,v31.16b
|
||||
ushr v30.2d,v20.2d,#26
|
||||
and v20.16b,v20.16b,v31.16b
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
ushr v30.2d,v21.2d,#26
|
||||
and v21.16b,v21.16b,v31.16b
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||
|
||||
ushr v29.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
ushr v30.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||
mov x4,#1
|
||||
st1 {v23.s}[0],[x0]
|
||||
str x4,[x0,#8] // set is_base2_26
|
||||
|
||||
ldr x29,[sp],#80
|
||||
.inst 0xd50323bf // autiasp
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
|
||||
.align 2
|
||||
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
||||
#endif
|
||||
230
arch/arm64/crypto/poly1305-glue.c
Normal file
230
arch/arm64/crypto/poly1305-glue.c
Normal file
@@ -0,0 +1,230 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
|
||||
*
|
||||
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
|
||||
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_init_arm64(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static int neon_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Feed whole blocks from @src into the Poly1305 state of a shash request.
 *
 * The shash interface delivers the key in-band: the first block of input
 * is used to initialise the state (r) and the following block is stored in
 * dctx->s, which is later passed to poly1305_emit() as the nonce. Until
 * both have been consumed (dctx->rset / dctx->sset) the leading blocks of
 * @src are treated as key material instead of being hashed.
 *
 * @dctx:    request context
 * @src:     input; callers in this file pass at least one full block
 * @len:     number of bytes at @src; a trailing partial block is ignored
 * @hibit:   forwarded unchanged to the assembly block routine
 * @do_neon: true when the caller has already made the NEON unit usable
 */
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit, bool do_neon)
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			/*
			 * Exactly one block is consumed here, so only the r
			 * half may be set up. poly1305_init_arch() must not be
			 * used: it also loads dctx->s from the block after
			 * @src, which is past the end of the input when the
			 * caller supplied a single block (as the buffered path
			 * does), and which would be overwritten below anyway.
			 */
			poly1305_init_arm64(&dctx->h, src);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->rset = 1;
		}
		if (len >= POLY1305_BLOCK_SIZE) {
			/* The next block is the s half of the key. */
			dctx->s[0] = get_unaligned_le32(src +  0);
			dctx->s[1] = get_unaligned_le32(src +  4);
			dctx->s[2] = get_unaligned_le32(src +  8);
			dctx->s[3] = get_unaligned_le32(src + 12);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->sset = true;
		}
		/* Nothing left to hash once the key material is removed. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	/* Only whole blocks are processed; the caller buffers the rest. */
	len &= ~(POLY1305_BLOCK_SIZE - 1);

	if (static_branch_likely(&have_neon) && likely(do_neon))
		poly1305_blocks_neon(&dctx->h, src, len, hibit);
	else
		poly1305_blocks(&dctx->h, src, len, hibit);
}
|
||||
|
||||
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, u32 len, bool do_neon)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
len -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
neon_poly1305_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1, false);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||
len %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(len)) {
|
||||
dctx->buflen = len;
|
||||
memcpy(dctx->buf, src, len);
|
||||
}
|
||||
}
|
||||
|
||||
static int neon_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
bool do_neon = may_use_simd() && srclen > 128;
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
if (static_branch_likely(&have_neon) && do_neon)
|
||||
kernel_neon_begin();
|
||||
neon_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||
if (static_branch_likely(&have_neon) && do_neon)
|
||||
kernel_neon_end();
|
||||
return 0;
|
||||
}
|
||||
|
||||
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int nbytes)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
nbytes -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||
|
||||
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||
do {
|
||||
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||
|
||||
kernel_neon_begin();
|
||||
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||
kernel_neon_end();
|
||||
|
||||
len -= todo;
|
||||
src += todo;
|
||||
} while (len);
|
||||
} else {
|
||||
poly1305_blocks(&dctx->h, src, len, 1);
|
||||
src += len;
|
||||
}
|
||||
nbytes %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(nbytes)) {
|
||||
dctx->buflen = nbytes;
|
||||
memcpy(dctx->buf, src, nbytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
/*
 * Library entry point: finish the computation and write the tag to @dst.
 *
 * A buffered partial block is terminated with a 1 byte, zero-filled to a
 * full block and processed with hibit 0. The tag is then emitted using
 * dctx->s as the nonce, and the whole context is cleared.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		unsigned int used = dctx->buflen;

		dctx->buf[used++] = 1;
		dctx->buflen = used;
		memset(dctx->buf + used, 0, POLY1305_BLOCK_SIZE - used);

		poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit(&dctx->h, dst, dctx->s);

	/* Do not leave key-dependent state behind. */
	*dctx = (struct poly1305_desc_ctx){};
}
|
||||
EXPORT_SYMBOL(poly1305_final_arch);
|
||||
|
||||
/*
 * shash .final: write the tag to @dst.
 *
 * Returns -ENOKEY when the s half of the key was never supplied through
 * update (dctx->sset is false); otherwise finalises and returns 0.
 */
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (likely(dctx->sset)) {
		poly1305_final_arch(dctx, dst);
		return 0;
	}

	return -ENOKEY;
}
|
||||
|
||||
static struct shash_alg neon_poly1305_alg = {
|
||||
.init = neon_poly1305_init,
|
||||
.update = neon_poly1305_update,
|
||||
.final = neon_poly1305_final,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
|
||||
.base.cra_name = "poly1305",
|
||||
.base.cra_driver_name = "poly1305-neon",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
};
|
||||
|
||||
/*
 * Module init.
 *
 * Without ASIMD in elf_hwcap nothing is enabled or registered and the
 * module still loads (returns 0), leaving the library entry points on the
 * scalar code. With ASIMD, the have_neon key is enabled and, if the hash
 * API is reachable from this build, the shash algorithm is registered.
 */
static int __init neon_poly1305_mod_init(void)
{
	if (!(elf_hwcap & HWCAP_ASIMD))
		return 0;

	static_branch_enable(&have_neon);

	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	return crypto_register_shash(&neon_poly1305_alg);
}
|
||||
|
||||
/*
 * Module exit: unregister the shash algorithm under the same conditions
 * that neon_poly1305_mod_init() registered it.
 */
static void __exit neon_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	if (!(elf_hwcap & HWCAP_ASIMD))
		return;

	crypto_unregister_shash(&neon_poly1305_alg);
}
|
||||
|
||||
module_init(neon_poly1305_mod_init);
|
||||
module_exit(neon_poly1305_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("poly1305");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
||||
@@ -22,16 +22,21 @@ endif
|
||||
|
||||
CC_COMPAT ?= $(CC)
|
||||
CC_COMPAT += $(CC_COMPAT_CLANG_FLAGS)
|
||||
|
||||
ifeq ($(LLVM),1)
|
||||
LD_COMPAT ?= $(LD)
|
||||
else
|
||||
LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld
|
||||
endif
|
||||
else
|
||||
CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc
|
||||
LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld
|
||||
endif
|
||||
|
||||
cc32-option = $(call try-run,\
|
||||
$(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
|
||||
cc32-disable-warning = $(call try-run,\
|
||||
$(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
|
||||
cc32-ldoption = $(call try-run,\
|
||||
$(CC_COMPAT) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
|
||||
cc32-as-instr = $(call try-run,\
|
||||
printf "%b\n" "$(1)" | $(CC_COMPAT) $(VDSO_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3))
|
||||
|
||||
@@ -114,14 +119,10 @@ dmbinstr := $(call cc32-as-instr,dmb ishld,-DCONFIG_AS_DMB_ISHLD=1)
|
||||
VDSO_CFLAGS += $(dmbinstr)
|
||||
VDSO_AFLAGS += $(dmbinstr)
|
||||
|
||||
VDSO_LDFLAGS := $(VDSO_CPPFLAGS)
|
||||
# From arm vDSO Makefile
|
||||
VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
|
||||
VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
|
||||
VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft
|
||||
VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--hash-style=sysv)
|
||||
VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--build-id)
|
||||
VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd)
|
||||
VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1
|
||||
VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096
|
||||
VDSO_LDFLAGS += -nostdlib -shared --hash-style=sysv --build-id
|
||||
|
||||
|
||||
# Borrow vdsomunge.c from the arm vDSO
|
||||
@@ -182,8 +183,8 @@ quiet_cmd_vdsold_and_vdso_check = LD32 $@
|
||||
cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check)
|
||||
|
||||
quiet_cmd_vdsold = LD32 $@
|
||||
cmd_vdsold = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \
|
||||
-Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
|
||||
cmd_vdsold = $(LD_COMPAT) $(VDSO_LDFLAGS) \
|
||||
-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
|
||||
quiet_cmd_vdsocc = CC32 $@
|
||||
cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $<
|
||||
quiet_cmd_vdsocc_gettimeofday = CC32 $@
|
||||
|
||||
@@ -499,7 +499,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
|
||||
if (map_start < map_end)
|
||||
memmap_init_zone((unsigned long)(map_end - map_start),
|
||||
args->nid, args->zone, page_to_pfn(map_start),
|
||||
MEMMAP_EARLY, NULL);
|
||||
MEMINIT_EARLY, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -508,8 +508,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone,
|
||||
unsigned long start_pfn)
|
||||
{
|
||||
if (!vmem_map) {
|
||||
memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY,
|
||||
NULL);
|
||||
memmap_init_zone(size, nid, zone, start_pfn,
|
||||
MEMINIT_EARLY, NULL);
|
||||
} else {
|
||||
struct page *start;
|
||||
struct memmap_init_callback_data args;
|
||||
|
||||
@@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
|
||||
# See arch/mips/Kbuild for content of core part of the kernel
|
||||
core-y += arch/mips/
|
||||
|
||||
drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
|
||||
drivers-y += arch/mips/crypto/
|
||||
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
|
||||
|
||||
# suspend and hibernation support
|
||||
|
||||
@@ -4,3 +4,21 @@
|
||||
#
|
||||
|
||||
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
|
||||
chacha-mips-y := chacha-core.o chacha-glue.o
|
||||
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
|
||||
|
||||
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
|
||||
poly1305-mips-y := poly1305-core.o poly1305-glue.o
|
||||
|
||||
perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
|
||||
perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
|
||||
|
||||
quiet_cmd_perlasm = PERLASM $@
|
||||
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
|
||||
|
||||
$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
|
||||
$(call if_changed,perlasm)
|
||||
|
||||
targets += poly1305-core.S
|
||||
|
||||
497
arch/mips/crypto/chacha-core.S
Normal file
497
arch/mips/crypto/chacha-core.S
Normal file
@@ -0,0 +1,497 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||
/*
|
||||
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#define MASK_U32 0x3c
|
||||
#define CHACHA20_BLOCK_SIZE 64
|
||||
#define STACK_SIZE 32
|
||||
|
||||
#define X0 $t0
|
||||
#define X1 $t1
|
||||
#define X2 $t2
|
||||
#define X3 $t3
|
||||
#define X4 $t4
|
||||
#define X5 $t5
|
||||
#define X6 $t6
|
||||
#define X7 $t7
|
||||
#define X8 $t8
|
||||
#define X9 $t9
|
||||
#define X10 $v1
|
||||
#define X11 $s6
|
||||
#define X12 $s5
|
||||
#define X13 $s4
|
||||
#define X14 $s3
|
||||
#define X15 $s2
|
||||
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
|
||||
#define T0 $s1
|
||||
#define T1 $s0
|
||||
#define T(n) T ## n
|
||||
#define X(n) X ## n
|
||||
|
||||
/* Input arguments */
|
||||
#define STATE $a0
|
||||
#define OUT $a1
|
||||
#define IN $a2
|
||||
#define BYTES $a3
|
||||
|
||||
/* Output argument */
|
||||
/* NONCE[0] is kept in a register and not in memory.
|
||||
* We don't want to touch original value in memory.
|
||||
* Must be incremented every loop iteration.
|
||||
*/
|
||||
#define NONCE_0 $v0
|
||||
|
||||
/* SAVED_X and SAVED_CA are set in the jump table.
|
||||
* Use regs which are overwritten on exit so we don't leak clear data.
|
||||
* They are used for handling the last bytes, which are not a multiple of 4.
|
||||
*/
|
||||
#define SAVED_X X15
|
||||
#define SAVED_CA $s7
|
||||
|
||||
#define IS_UNALIGNED $s7
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
#define MSB 0
|
||||
#define LSB 3
|
||||
#define ROTx rotl
|
||||
#define ROTR(n) rotr n, 24
|
||||
#define CPU_TO_LE32(n) \
|
||||
wsbh n; \
|
||||
rotr n, 16;
|
||||
#else
|
||||
#define MSB 3
|
||||
#define LSB 0
|
||||
#define ROTx rotr
|
||||
#define CPU_TO_LE32(n)
|
||||
#define ROTR(n)
|
||||
#endif
|
||||
|
||||
#define FOR_EACH_WORD(x) \
|
||||
x( 0); \
|
||||
x( 1); \
|
||||
x( 2); \
|
||||
x( 3); \
|
||||
x( 4); \
|
||||
x( 5); \
|
||||
x( 6); \
|
||||
x( 7); \
|
||||
x( 8); \
|
||||
x( 9); \
|
||||
x(10); \
|
||||
x(11); \
|
||||
x(12); \
|
||||
x(13); \
|
||||
x(14); \
|
||||
x(15);
|
||||
|
||||
#define FOR_EACH_WORD_REV(x) \
|
||||
x(15); \
|
||||
x(14); \
|
||||
x(13); \
|
||||
x(12); \
|
||||
x(11); \
|
||||
x(10); \
|
||||
x( 9); \
|
||||
x( 8); \
|
||||
x( 7); \
|
||||
x( 6); \
|
||||
x( 5); \
|
||||
x( 4); \
|
||||
x( 3); \
|
||||
x( 2); \
|
||||
x( 1); \
|
||||
x( 0);
|
||||
|
||||
#define PLUS_ONE_0 1
|
||||
#define PLUS_ONE_1 2
|
||||
#define PLUS_ONE_2 3
|
||||
#define PLUS_ONE_3 4
|
||||
#define PLUS_ONE_4 5
|
||||
#define PLUS_ONE_5 6
|
||||
#define PLUS_ONE_6 7
|
||||
#define PLUS_ONE_7 8
|
||||
#define PLUS_ONE_8 9
|
||||
#define PLUS_ONE_9 10
|
||||
#define PLUS_ONE_10 11
|
||||
#define PLUS_ONE_11 12
|
||||
#define PLUS_ONE_12 13
|
||||
#define PLUS_ONE_13 14
|
||||
#define PLUS_ONE_14 15
|
||||
#define PLUS_ONE_15 16
|
||||
#define PLUS_ONE(x) PLUS_ONE_ ## x
|
||||
#define _CONCAT3(a,b,c) a ## b ## c
|
||||
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
|
||||
|
||||
#define STORE_UNALIGNED(x) \
|
||||
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
|
||||
.if (x != 12); \
|
||||
lw T0, (x*4)(STATE); \
|
||||
.endif; \
|
||||
lwl T1, (x*4)+MSB ## (IN); \
|
||||
lwr T1, (x*4)+LSB ## (IN); \
|
||||
.if (x == 12); \
|
||||
addu X ## x, NONCE_0; \
|
||||
.else; \
|
||||
addu X ## x, T0; \
|
||||
.endif; \
|
||||
CPU_TO_LE32(X ## x); \
|
||||
xor X ## x, T1; \
|
||||
swl X ## x, (x*4)+MSB ## (OUT); \
|
||||
swr X ## x, (x*4)+LSB ## (OUT);
|
||||
|
||||
#define STORE_ALIGNED(x) \
|
||||
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
|
||||
.if (x != 12); \
|
||||
lw T0, (x*4)(STATE); \
|
||||
.endif; \
|
||||
lw T1, (x*4) ## (IN); \
|
||||
.if (x == 12); \
|
||||
addu X ## x, NONCE_0; \
|
||||
.else; \
|
||||
addu X ## x, T0; \
|
||||
.endif; \
|
||||
CPU_TO_LE32(X ## x); \
|
||||
xor X ## x, T1; \
|
||||
sw X ## x, (x*4) ## (OUT);
|
||||
|
||||
/* Jump table macro.
|
||||
* Used for setup and handling the last bytes, which are not multiple of 4.
|
||||
* X15 is free to store Xn
|
||||
* Every jumptable entry must be equal in size.
|
||||
*/
|
||||
#define JMPTBL_ALIGNED(x) \
|
||||
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
|
||||
.set noreorder; \
|
||||
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
|
||||
.if (x == 12); \
|
||||
addu SAVED_X, X ## x, NONCE_0; \
|
||||
.else; \
|
||||
addu SAVED_X, X ## x, SAVED_CA; \
|
||||
.endif; \
|
||||
.set reorder
|
||||
|
||||
#define JMPTBL_UNALIGNED(x) \
|
||||
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
|
||||
.set noreorder; \
|
||||
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
|
||||
.if (x == 12); \
|
||||
addu SAVED_X, X ## x, NONCE_0; \
|
||||
.else; \
|
||||
addu SAVED_X, X ## x, SAVED_CA; \
|
||||
.endif; \
|
||||
.set reorder
|
||||
|
||||
/* AXR: one add / xor / rotate-left step applied to four word groups at once.
 * For each of the four lanes: X(A..D) += X(K..N), then X(V..Z) ^= X(A..D),
 * then X(V..Z) is rotated left by S bits.
 * All arguments except S are indices into the X0..X15 register aliases.
 */
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
|
||||
addu X(A), X(K); \
|
||||
addu X(B), X(L); \
|
||||
addu X(C), X(M); \
|
||||
addu X(D), X(N); \
|
||||
xor X(V), X(A); \
|
||||
xor X(W), X(B); \
|
||||
xor X(Y), X(C); \
|
||||
xor X(Z), X(D); \
|
||||
rotl X(V), S; \
|
||||
rotl X(W), S; \
|
||||
rotl X(Y), S; \
|
||||
rotl X(Z), S;
|
||||
|
||||
.text
|
||||
.set reorder
|
||||
.set noat
|
||||
.globl chacha_crypt_arch
|
||||
.ent chacha_crypt_arch
|
||||
chacha_crypt_arch:
|
||||
.frame $sp, STACK_SIZE, $ra
|
||||
|
||||
/* Load number of rounds */
|
||||
lw $at, 16($sp)
|
||||
|
||||
addiu $sp, -STACK_SIZE
|
||||
|
||||
/* Return bytes = 0. */
|
||||
beqz BYTES, .Lchacha_mips_end
|
||||
|
||||
lw NONCE_0, 48(STATE)
|
||||
|
||||
/* Save s0-s7 */
|
||||
sw $s0, 0($sp)
|
||||
sw $s1, 4($sp)
|
||||
sw $s2, 8($sp)
|
||||
sw $s3, 12($sp)
|
||||
sw $s4, 16($sp)
|
||||
sw $s5, 20($sp)
|
||||
sw $s6, 24($sp)
|
||||
sw $s7, 28($sp)
|
||||
|
||||
/* Test IN or OUT is unaligned.
|
||||
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
|
||||
*/
|
||||
or IS_UNALIGNED, IN, OUT
|
||||
andi IS_UNALIGNED, 0x3
|
||||
|
||||
b .Lchacha_rounds_start
|
||||
|
||||
.align 4
|
||||
.Loop_chacha_rounds:
|
||||
addiu IN, CHACHA20_BLOCK_SIZE
|
||||
addiu OUT, CHACHA20_BLOCK_SIZE
|
||||
addiu NONCE_0, 1
|
||||
|
||||
.Lchacha_rounds_start:
|
||||
lw X0, 0(STATE)
|
||||
lw X1, 4(STATE)
|
||||
lw X2, 8(STATE)
|
||||
lw X3, 12(STATE)
|
||||
|
||||
lw X4, 16(STATE)
|
||||
lw X5, 20(STATE)
|
||||
lw X6, 24(STATE)
|
||||
lw X7, 28(STATE)
|
||||
lw X8, 32(STATE)
|
||||
lw X9, 36(STATE)
|
||||
lw X10, 40(STATE)
|
||||
lw X11, 44(STATE)
|
||||
|
||||
move X12, NONCE_0
|
||||
lw X13, 52(STATE)
|
||||
lw X14, 56(STATE)
|
||||
lw X15, 60(STATE)
|
||||
|
||||
.Loop_chacha_xor_rounds:
|
||||
addiu $at, -2
|
||||
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||
bnez $at, .Loop_chacha_xor_rounds
|
||||
|
||||
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
|
||||
|
||||
/* Is data src/dst unaligned? Jump */
|
||||
bnez IS_UNALIGNED, .Loop_chacha_unaligned
|
||||
|
||||
/* Set number rounds here to fill delayslot. */
|
||||
lw $at, (STACK_SIZE+16)($sp)
|
||||
|
||||
/* BYTES < 0, it has no full block. */
|
||||
bltz BYTES, .Lchacha_mips_no_full_block_aligned
|
||||
|
||||
FOR_EACH_WORD_REV(STORE_ALIGNED)
|
||||
|
||||
/* BYTES > 0? Loop again. */
|
||||
bgtz BYTES, .Loop_chacha_rounds
|
||||
|
||||
/* Place this here to fill delay slot */
|
||||
addiu NONCE_0, 1
|
||||
|
||||
/* BYTES < 0? Handle last bytes */
|
||||
bltz BYTES, .Lchacha_mips_xor_bytes
|
||||
|
||||
.Lchacha_mips_xor_done:
|
||||
/* Restore used registers */
|
||||
lw $s0, 0($sp)
|
||||
lw $s1, 4($sp)
|
||||
lw $s2, 8($sp)
|
||||
lw $s3, 12($sp)
|
||||
lw $s4, 16($sp)
|
||||
lw $s5, 20($sp)
|
||||
lw $s6, 24($sp)
|
||||
lw $s7, 28($sp)
|
||||
|
||||
/* Write NONCE_0 back to right location in state */
|
||||
sw NONCE_0, 48(STATE)
|
||||
|
||||
.Lchacha_mips_end:
|
||||
addiu $sp, STACK_SIZE
|
||||
jr $ra
|
||||
|
||||
.Lchacha_mips_no_full_block_aligned:
|
||||
/* Restore the offset on BYTES */
|
||||
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||
|
||||
/* Get number of full WORDS */
|
||||
andi $at, BYTES, MASK_U32
|
||||
|
||||
/* Load upper half of jump table addr */
|
||||
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
|
||||
|
||||
/* Calculate lower half jump table offset */
|
||||
ins T0, $at, 1, 6
|
||||
|
||||
/* Add offset to STATE */
|
||||
addu T1, STATE, $at
|
||||
|
||||
/* Add lower half jump table addr */
|
||||
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
|
||||
|
||||
/* Read value from STATE */
|
||||
lw SAVED_CA, 0(T1)
|
||||
|
||||
/* Store remaining bytecounter as negative value */
|
||||
subu BYTES, $at, BYTES
|
||||
|
||||
jr T0
|
||||
|
||||
/* Jump table */
|
||||
FOR_EACH_WORD(JMPTBL_ALIGNED)
|
||||
|
||||
|
||||
.Loop_chacha_unaligned:
|
||||
/* Set number rounds here to fill delayslot. */
|
||||
lw $at, (STACK_SIZE+16)($sp)
|
||||
|
||||
/* BYTES < 0, it has no full block. */
|
||||
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
|
||||
|
||||
FOR_EACH_WORD_REV(STORE_UNALIGNED)
|
||||
|
||||
/* BYTES > 0? Loop again. */
|
||||
bgtz BYTES, .Loop_chacha_rounds
|
||||
|
||||
/* Write NONCE_0 back to right location in state */
|
||||
sw NONCE_0, 48(STATE)
|
||||
|
||||
.set noreorder
|
||||
/* Fall through to byte handling */
|
||||
bgez BYTES, .Lchacha_mips_xor_done
|
||||
.Lchacha_mips_xor_unaligned_0_b:
|
||||
.Lchacha_mips_xor_aligned_0_b:
|
||||
/* Place this here to fill delay slot */
|
||||
addiu NONCE_0, 1
|
||||
.set reorder
|
||||
|
||||
.Lchacha_mips_xor_bytes:
|
||||
addu IN, $at
|
||||
addu OUT, $at
|
||||
/* First byte */
|
||||
lbu T1, 0(IN)
|
||||
addiu $at, BYTES, 1
|
||||
CPU_TO_LE32(SAVED_X)
|
||||
ROTR(SAVED_X)
|
||||
xor T1, SAVED_X
|
||||
sb T1, 0(OUT)
|
||||
beqz $at, .Lchacha_mips_xor_done
|
||||
/* Second byte */
|
||||
lbu T1, 1(IN)
|
||||
addiu $at, BYTES, 2
|
||||
ROTx SAVED_X, 8
|
||||
xor T1, SAVED_X
|
||||
sb T1, 1(OUT)
|
||||
beqz $at, .Lchacha_mips_xor_done
|
||||
/* Third byte */
|
||||
lbu T1, 2(IN)
|
||||
ROTx SAVED_X, 8
|
||||
xor T1, SAVED_X
|
||||
sb T1, 2(OUT)
|
||||
b .Lchacha_mips_xor_done
|
||||
|
||||
.Lchacha_mips_no_full_block_unaligned:
|
||||
/* Restore the offset on BYTES */
|
||||
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||
|
||||
/* Get number of full WORDS */
|
||||
andi $at, BYTES, MASK_U32
|
||||
|
||||
/* Load upper half of jump table addr */
|
||||
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
|
||||
|
||||
/* Calculate lower half jump table offset */
|
||||
ins T0, $at, 1, 6
|
||||
|
||||
/* Add offset to STATE */
|
||||
addu T1, STATE, $at
|
||||
|
||||
/* Add lower half jump table addr */
|
||||
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
|
||||
|
||||
/* Read value from STATE */
|
||||
lw SAVED_CA, 0(T1)
|
||||
|
||||
/* Store remaining bytecounter as negative value */
|
||||
subu BYTES, $at, BYTES
|
||||
|
||||
jr T0
|
||||
|
||||
/* Jump table */
|
||||
FOR_EACH_WORD(JMPTBL_UNALIGNED)
|
||||
.end chacha_crypt_arch
|
||||
.set at
|
||||
|
||||
/* Input arguments
|
||||
* STATE $a0
|
||||
* OUT $a1
|
||||
* NROUND $a2
|
||||
*/
|
||||
|
||||
#undef X12
|
||||
#undef X13
|
||||
#undef X14
|
||||
#undef X15
|
||||
|
||||
#define X12 $a3
|
||||
#define X13 $at
|
||||
#define X14 $v0
|
||||
#define X15 STATE
|
||||
|
||||
.set noat
|
||||
.globl hchacha_block_arch
|
||||
.ent hchacha_block_arch
|
||||
hchacha_block_arch:
|
||||
.frame $sp, STACK_SIZE, $ra
|
||||
|
||||
addiu $sp, -STACK_SIZE
|
||||
|
||||
/* Save X11(s6) */
|
||||
sw X11, 0($sp)
|
||||
|
||||
lw X0, 0(STATE)
|
||||
lw X1, 4(STATE)
|
||||
lw X2, 8(STATE)
|
||||
lw X3, 12(STATE)
|
||||
lw X4, 16(STATE)
|
||||
lw X5, 20(STATE)
|
||||
lw X6, 24(STATE)
|
||||
lw X7, 28(STATE)
|
||||
lw X8, 32(STATE)
|
||||
lw X9, 36(STATE)
|
||||
lw X10, 40(STATE)
|
||||
lw X11, 44(STATE)
|
||||
lw X12, 48(STATE)
|
||||
lw X13, 52(STATE)
|
||||
lw X14, 56(STATE)
|
||||
lw X15, 60(STATE)
|
||||
|
||||
.Loop_hchacha_xor_rounds:
|
||||
addiu $a2, -2
|
||||
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||
bnez $a2, .Loop_hchacha_xor_rounds
|
||||
|
||||
/* Restore used register */
|
||||
lw X11, 0($sp)
|
||||
|
||||
sw X0, 0(OUT)
|
||||
sw X1, 4(OUT)
|
||||
sw X2, 8(OUT)
|
||||
sw X3, 12(OUT)
|
||||
sw X12, 16(OUT)
|
||||
sw X13, 20(OUT)
|
||||
sw X14, 24(OUT)
|
||||
sw X15, 28(OUT)
|
||||
|
||||
addiu $sp, STACK_SIZE
|
||||
jr $ra
|
||||
.end hchacha_block_arch
|
||||
.set at
|
||||
152
arch/mips/crypto/chacha-glue.c
Normal file
152
arch/mips/crypto/chacha-glue.c
Normal file
@@ -0,0 +1,152 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* MIPS accelerated ChaCha and XChaCha stream ciphers,
|
||||
* including ChaCha20 (RFC7539)
|
||||
*
|
||||
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/byteorder.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes, int nrounds);
|
||||
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||
|
||||
asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
|
||||
EXPORT_SYMBOL(hchacha_block_arch);
|
||||
|
||||
/*
 * chacha_init_arch - fill a ChaCha state matrix from a key and an IV.
 * @state: state array to initialise
 * @key:   key words
 * @iv:    initialisation vector bytes
 *
 * Forwards all three arguments unchanged to chacha_init_generic().
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
|
||||
EXPORT_SYMBOL(chacha_init_arch);
|
||||
|
||||
static int chacha_mips_stream_xor(struct skcipher_request *req,
|
||||
const struct chacha_ctx *ctx, const u8 *iv)
|
||||
{
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
chacha_init_generic(state, ctx->key, iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes, ctx->nrounds);
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int chacha_mips(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
return chacha_mips_stream_xor(req, ctx, req->iv);
|
||||
}
|
||||
|
||||
static int xchacha_mips(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct chacha_ctx subctx;
|
||||
u32 state[16];
|
||||
u8 real_iv[16];
|
||||
|
||||
chacha_init_generic(state, ctx->key, req->iv);
|
||||
|
||||
hchacha_block(state, subctx.key, ctx->nrounds);
|
||||
subctx.nrounds = ctx->nrounds;
|
||||
|
||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||
return chacha_mips_stream_xor(req, &subctx, real_iv);
|
||||
}
|
||||
|
||||
static struct skcipher_alg algs[] = {
|
||||
{
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-mips",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = chacha_mips,
|
||||
.decrypt = chacha_mips,
|
||||
}, {
|
||||
.base.cra_name = "xchacha20",
|
||||
.base.cra_driver_name = "xchacha20-mips",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = xchacha_mips,
|
||||
.decrypt = xchacha_mips,
|
||||
}, {
|
||||
.base.cra_name = "xchacha12",
|
||||
.base.cra_driver_name = "xchacha12-mips",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = xchacha_mips,
|
||||
.decrypt = xchacha_mips,
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Module init: register the skcipher algorithms, but only when
 * CONFIG_CRYPTO_BLKCIPHER is reachable from this object; otherwise succeed
 * without registering anything.
 */
static int __init chacha_simd_mod_init(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return 0;

	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||
|
||||
/*
 * Module exit: unregister the skcipher algorithms under the same
 * reachability condition used at init time.
 */
static void __exit chacha_simd_mod_fini(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return;

	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||
|
||||
module_init(chacha_simd_mod_init);
|
||||
module_exit(chacha_simd_mod_fini);
|
||||
|
||||
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-mips");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20-mips");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12-mips");
|
||||
191
arch/mips/crypto/poly1305-glue.c
Normal file
191
arch/mips/crypto/poly1305-glue.c
Normal file
@@ -0,0 +1,191 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
|
||||
*
|
||||
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
|
||||
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_init_mips(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static int mips_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * mips_poly1305_blocks - feed whole blocks to the assembly core, taking the
 * key from the start of the data first if it has not been set yet.
 * @dctx:  descriptor context
 * @src:   input data
 * @len:   number of input bytes
 * @hibit: passed through unchanged to poly1305_blocks_mips()
 *
 * While dctx->sset is false, the first block seen initialises the state via
 * poly1305_init_mips() (rset) and the next block is loaded into dctx->s
 * (sset). Only data remaining after that, rounded down to a whole number of
 * blocks, is processed.
 */
static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit)
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			poly1305_init_mips(&dctx->h, src);
			dctx->rset = 1;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		if (len >= POLY1305_BLOCK_SIZE) {
			int i;

			for (i = 0; i < 4; i++)
				dctx->s[i] = get_unaligned_le32(src + 4 * i);
			dctx->sset = true;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		/* Nothing but key material (or a partial block) was supplied. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	poly1305_blocks_mips(&dctx->h, src,
			     round_down(len, POLY1305_BLOCK_SIZE), hibit);
}
|
||||
|
||||
static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
|
||||
unsigned int len)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
len -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||
mips_poly1305_blocks(dctx, src, len, 1);
|
||||
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||
len %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(len)) {
|
||||
dctx->buflen = len;
|
||||
memcpy(dctx->buf, src, len);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int nbytes)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
nbytes -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_blocks_mips(&dctx->h, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||
|
||||
poly1305_blocks_mips(&dctx->h, src, len, 1);
|
||||
src += len;
|
||||
nbytes %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(nbytes)) {
|
||||
dctx->buflen = nbytes;
|
||||
memcpy(dctx->buf, src, nbytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
/*
 * poly1305_final_arch - finish the computation and write the digest to @dst.
 *
 * A buffered partial block is terminated with a single 1 byte, zero-padded to
 * a full block and processed with hibit 0. The digest is then emitted using
 * dctx->s as the nonce, and the whole context is cleared.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	unsigned int used = dctx->buflen;

	if (unlikely(used)) {
		dctx->buf[used] = 1;
		used++;
		memset(&dctx->buf[used], 0, POLY1305_BLOCK_SIZE - used);
		dctx->buflen = used;

		poly1305_blocks_mips(&dctx->h, dctx->buf,
				     POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_mips(&dctx->h, dst, dctx->s);

	/* Wipe the context so it holds no state after finalisation. */
	*dctx = (struct poly1305_desc_ctx){};
}
|
||||
EXPORT_SYMBOL(poly1305_final_arch);
|
||||
|
||||
/*
 * shash .final callback: emit the digest into @dst.
 *
 * Returns -ENOKEY if the second key half was never set (dctx->sset is false),
 * 0 otherwise.
 */
static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (likely(dctx->sset)) {
		poly1305_final_arch(dctx, dst);
		return 0;
	}

	return -ENOKEY;
}
|
||||
|
||||
static struct shash_alg mips_poly1305_alg = {
|
||||
.init = mips_poly1305_init,
|
||||
.update = mips_poly1305_update,
|
||||
.final = mips_poly1305_final,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
|
||||
.base.cra_name = "poly1305",
|
||||
.base.cra_driver_name = "poly1305-mips",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
};
|
||||
|
||||
/*
 * Module init: register the shash algorithm, but only when CONFIG_CRYPTO_HASH
 * is reachable from this object; otherwise succeed without registering.
 */
static int __init mips_poly1305_mod_init(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	return crypto_register_shash(&mips_poly1305_alg);
}
|
||||
|
||||
/*
 * Module exit: unregister the shash algorithm under the same reachability
 * condition used at init time.
 */
static void __exit mips_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	crypto_unregister_shash(&mips_poly1305_alg);
}
|
||||
|
||||
module_init(mips_poly1305_mod_init);
|
||||
module_exit(mips_poly1305_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("poly1305");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-mips");
|
||||
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
File diff suppressed because it is too large
Load Diff
@@ -12,6 +12,8 @@
|
||||
#ifndef _ASM_POWERPC_LMB_H
|
||||
#define _ASM_POWERPC_LMB_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
|
||||
struct drmem_lmb {
|
||||
u64 base_addr;
|
||||
u32 drc_index;
|
||||
@@ -27,8 +29,22 @@ struct drmem_lmb_info {
|
||||
|
||||
extern struct drmem_lmb_info *drmem_info;
|
||||
|
||||
static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
|
||||
const struct drmem_lmb *start)
|
||||
{
|
||||
/*
|
||||
* DLPAR code paths can take several milliseconds per element
|
||||
* when interacting with firmware. Ensure that we don't
|
||||
* unfairly monopolize the CPU.
|
||||
*/
|
||||
if (((++lmb - start) % 16) == 0)
|
||||
cond_resched();
|
||||
|
||||
return lmb;
|
||||
}
|
||||
|
||||
#define for_each_drmem_lmb_in_range(lmb, start, end) \
|
||||
for ((lmb) = (start); (lmb) < (end); (lmb)++)
|
||||
for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))
|
||||
|
||||
#define for_each_drmem_lmb(lmb) \
|
||||
for_each_drmem_lmb_in_range((lmb), \
|
||||
|
||||
@@ -788,7 +788,7 @@
|
||||
#define THRM1_TIN (1 << 31)
|
||||
#define THRM1_TIV (1 << 30)
|
||||
#define THRM1_THRES(x) ((x&0x7f)<<23)
|
||||
#define THRM3_SITV(x) ((x&0x3fff)<<1)
|
||||
#define THRM3_SITV(x) ((x & 0x1fff) << 1)
|
||||
#define THRM1_TID (1<<2)
|
||||
#define THRM1_TIE (1<<1)
|
||||
#define THRM1_V (1<<0)
|
||||
|
||||
@@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||
return false;
|
||||
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||
}
|
||||
static inline void mm_reset_thread_local(struct mm_struct *mm)
|
||||
{
|
||||
WARN_ON(atomic_read(&mm->context.copros) > 0);
|
||||
/*
|
||||
* It's possible for mm_access to take a reference on mm_users to
|
||||
* access the remote mm from another thread, but it's not allowed
|
||||
* to set mm_cpumask, so mm_users may be > 1 here.
|
||||
*/
|
||||
WARN_ON(current->mm != mm);
|
||||
atomic_set(&mm->context.active_cpus, 1);
|
||||
cpumask_clear(mm_cpumask(mm));
|
||||
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||
}
|
||||
#else /* CONFIG_PPC_BOOK3S_64 */
|
||||
static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||
{
|
||||
|
||||
@@ -13,13 +13,14 @@
|
||||
*/
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/param.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include <asm/io.h>
|
||||
#include <asm/reg.h>
|
||||
@@ -39,9 +40,7 @@ static struct tau_temp
|
||||
unsigned char grew;
|
||||
} tau[NR_CPUS];
|
||||
|
||||
struct timer_list tau_timer;
|
||||
|
||||
#undef DEBUG
|
||||
static bool tau_int_enable;
|
||||
|
||||
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
|
||||
* dynamic adjustment to minimize # of interrupts */
|
||||
@@ -50,72 +49,49 @@ struct timer_list tau_timer;
|
||||
#define step_size 2 /* step size when temp goes out of range */
|
||||
#define window_expand 1 /* expand the window by this much */
|
||||
/* configurable values for shrinking the window */
|
||||
#define shrink_timer 2*HZ /* period between shrinking the window */
|
||||
#define shrink_timer 2000 /* period between shrinking the window */
|
||||
#define min_window 2 /* minimum window size, degrees C */
|
||||
|
||||
static void set_thresholds(unsigned long cpu)
|
||||
{
|
||||
#ifdef CONFIG_TAU_INT
|
||||
/*
|
||||
* setup THRM1,
|
||||
* threshold, valid bit, enable interrupts, interrupt when below threshold
|
||||
*/
|
||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
|
||||
u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
|
||||
|
||||
/* setup THRM2,
|
||||
* threshold, valid bit, enable interrupts, interrupt when above threshold
|
||||
*/
|
||||
mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
|
||||
#else
|
||||
/* same thing but don't enable interrupts */
|
||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
|
||||
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
|
||||
#endif
|
||||
/* setup THRM1, threshold, valid bit, interrupt when below threshold */
|
||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
|
||||
|
||||
/* setup THRM2, threshold, valid bit, interrupt when above threshold */
|
||||
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
|
||||
}
|
||||
|
||||
static void TAUupdate(int cpu)
|
||||
{
|
||||
unsigned thrm;
|
||||
|
||||
#ifdef DEBUG
|
||||
printk("TAUupdate ");
|
||||
#endif
|
||||
u32 thrm;
|
||||
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
|
||||
|
||||
/* if both thresholds are crossed, the step_sizes cancel out
|
||||
* and the window winds up getting expanded twice. */
|
||||
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
|
||||
if(thrm & THRM1_TIN){ /* crossed low threshold */
|
||||
if (tau[cpu].low >= step_size){
|
||||
tau[cpu].low -= step_size;
|
||||
tau[cpu].high -= (step_size - window_expand);
|
||||
}
|
||||
tau[cpu].grew = 1;
|
||||
#ifdef DEBUG
|
||||
printk("low threshold crossed ");
|
||||
#endif
|
||||
thrm = mfspr(SPRN_THRM1);
|
||||
if ((thrm & bits) == bits) {
|
||||
mtspr(SPRN_THRM1, 0);
|
||||
|
||||
if (tau[cpu].low >= step_size) {
|
||||
tau[cpu].low -= step_size;
|
||||
tau[cpu].high -= (step_size - window_expand);
|
||||
}
|
||||
tau[cpu].grew = 1;
|
||||
pr_debug("%s: low threshold crossed\n", __func__);
|
||||
}
|
||||
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
|
||||
if(thrm & THRM1_TIN){ /* crossed high threshold */
|
||||
if (tau[cpu].high <= 127-step_size){
|
||||
tau[cpu].low += (step_size - window_expand);
|
||||
tau[cpu].high += step_size;
|
||||
}
|
||||
tau[cpu].grew = 1;
|
||||
#ifdef DEBUG
|
||||
printk("high threshold crossed ");
|
||||
#endif
|
||||
thrm = mfspr(SPRN_THRM2);
|
||||
if ((thrm & bits) == bits) {
|
||||
mtspr(SPRN_THRM2, 0);
|
||||
|
||||
if (tau[cpu].high <= 127 - step_size) {
|
||||
tau[cpu].low += (step_size - window_expand);
|
||||
tau[cpu].high += step_size;
|
||||
}
|
||||
tau[cpu].grew = 1;
|
||||
pr_debug("%s: high threshold crossed\n", __func__);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printk("grew = %d\n", tau[cpu].grew);
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
|
||||
set_thresholds(cpu);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TAU_INT
|
||||
@@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
|
||||
static void tau_timeout(void * info)
|
||||
{
|
||||
int cpu;
|
||||
unsigned long flags;
|
||||
int size;
|
||||
int shrink;
|
||||
|
||||
/* disabling interrupts *should* be okay */
|
||||
local_irq_save(flags);
|
||||
cpu = smp_processor_id();
|
||||
|
||||
#ifndef CONFIG_TAU_INT
|
||||
TAUupdate(cpu);
|
||||
#endif
|
||||
if (!tau_int_enable)
|
||||
TAUupdate(cpu);
|
||||
|
||||
/* Stop thermal sensor comparisons and interrupts */
|
||||
mtspr(SPRN_THRM3, 0);
|
||||
|
||||
size = tau[cpu].high - tau[cpu].low;
|
||||
if (size > min_window && ! tau[cpu].grew) {
|
||||
@@ -173,32 +148,26 @@ static void tau_timeout(void * info)
|
||||
|
||||
set_thresholds(cpu);
|
||||
|
||||
/*
|
||||
* Do the enable every time, since otherwise a bunch of (relatively)
|
||||
* complex sleep code needs to be added. One mtspr every time
|
||||
* tau_timeout is called is probably not a big deal.
|
||||
*
|
||||
* Enable thermal sensor and set up sample interval timer
|
||||
* need 20 us to do the compare.. until a nice 'cpu_speed' function
|
||||
* call is implemented, just assume a 500 mhz clock. It doesn't really
|
||||
* matter if we take too long for a compare since it's all interrupt
|
||||
* driven anyway.
|
||||
*
|
||||
* use a extra long time.. (60 us @ 500 mhz)
|
||||
/* Restart thermal sensor comparisons and interrupts.
|
||||
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
|
||||
* recommends that "the maximum value be set in THRM3 under all
|
||||
* conditions."
|
||||
*/
|
||||
mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
|
||||
|
||||
local_irq_restore(flags);
|
||||
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
|
||||
}
|
||||
|
||||
static void tau_timeout_smp(struct timer_list *unused)
|
||||
static struct workqueue_struct *tau_workq;
|
||||
|
||||
static void tau_work_func(struct work_struct *work)
|
||||
{
|
||||
|
||||
/* schedule ourselves to be run again */
|
||||
mod_timer(&tau_timer, jiffies + shrink_timer) ;
|
||||
msleep(shrink_timer);
|
||||
on_each_cpu(tau_timeout, NULL, 0);
|
||||
/* schedule ourselves to be run again */
|
||||
queue_work(tau_workq, work);
|
||||
}
|
||||
|
||||
DECLARE_WORK(tau_work, tau_work_func);
|
||||
|
||||
/*
|
||||
* setup the TAU
|
||||
*
|
||||
@@ -231,21 +200,19 @@ static int __init TAU_init(void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
|
||||
!strcmp(cur_cpu_spec->platform, "ppc750");
|
||||
|
||||
/* first, set up the window shrinking timer */
|
||||
timer_setup(&tau_timer, tau_timeout_smp, 0);
|
||||
tau_timer.expires = jiffies + shrink_timer;
|
||||
add_timer(&tau_timer);
|
||||
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
|
||||
if (!tau_workq)
|
||||
return -ENOMEM;
|
||||
|
||||
on_each_cpu(TAU_init_smp, NULL, 0);
|
||||
|
||||
printk("Thermal assist unit ");
|
||||
#ifdef CONFIG_TAU_INT
|
||||
printk("using interrupts, ");
|
||||
#else
|
||||
printk("using timers, ");
|
||||
#endif
|
||||
printk("shrink_timer: %d jiffies\n", shrink_timer);
|
||||
queue_work(tau_workq, &tau_work);
|
||||
|
||||
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
|
||||
tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
|
||||
tau_initialized = 1;
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
|
||||
struct mm_struct *mm = arg;
|
||||
unsigned long pid = mm->context.id;
|
||||
|
||||
/*
|
||||
* A kthread could have done a mmget_not_zero() after the flushing CPU
|
||||
* checked mm_is_singlethreaded, and be in the process of
|
||||
* kthread_use_mm when interrupted here. In that case, current->mm will
|
||||
* be set to mm, because kthread_use_mm() setting ->mm and switching to
|
||||
* the mm is done with interrupts off.
|
||||
*/
|
||||
if (current->mm == mm)
|
||||
return; /* Local CPU */
|
||||
goto out_flush;
|
||||
|
||||
if (current->active_mm == mm) {
|
||||
/*
|
||||
* Must be a kernel thread because sender is single-threaded.
|
||||
*/
|
||||
BUG_ON(current->mm);
|
||||
WARN_ON_ONCE(current->mm != NULL);
|
||||
/* Is a kernel thread and is using mm as the lazy tlb */
|
||||
mmgrab(&init_mm);
|
||||
switch_mm(mm, &init_mm, current);
|
||||
current->active_mm = &init_mm;
|
||||
switch_mm_irqs_off(mm, &init_mm, current);
|
||||
mmdrop(mm);
|
||||
}
|
||||
|
||||
atomic_dec(&mm->context.active_cpus);
|
||||
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||
|
||||
out_flush:
|
||||
_tlbiel_pid(pid, RIC_FLUSH_ALL);
|
||||
}
|
||||
|
||||
@@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
|
||||
*/
|
||||
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
|
||||
(void *)mm, 1);
|
||||
mm_reset_thread_local(mm);
|
||||
}
|
||||
|
||||
void radix__flush_tlb_mm(struct mm_struct *mm)
|
||||
|
||||
@@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id)
|
||||
|
||||
#define REQUEST_NAME system_performance_capabilities
|
||||
#define REQUEST_NUM 0x40
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||
#include I(REQUEST_BEGIN)
|
||||
REQUEST(__field(0, 1, perf_collect_privileged)
|
||||
__field(0x1, 1, capability_mask)
|
||||
@@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id)
|
||||
|
||||
#define REQUEST_NAME system_hypervisor_times
|
||||
#define REQUEST_NUM 0xF0
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||
#include I(REQUEST_BEGIN)
|
||||
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
|
||||
@@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||
|
||||
#define REQUEST_NAME system_tlbie_count_and_time
|
||||
#define REQUEST_NUM 0xF4
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
||||
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||
#include I(REQUEST_BEGIN)
|
||||
REQUEST(__count(0, 8, tlbie_instructions_issued)
|
||||
/*
|
||||
|
||||
@@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||
|
||||
mask |= CNST_PMC_MASK(pmc);
|
||||
value |= CNST_PMC_VAL(pmc);
|
||||
|
||||
/*
|
||||
* PMC5 and PMC6 are used to count cycles and instructions and
|
||||
* they do not support most of the constraint bits. Add a check
|
||||
* to exclude PMC5/6 from most of the constraints except for
|
||||
* EBB/BHRB.
|
||||
*/
|
||||
if (pmc >= 5)
|
||||
goto ebb_bhrb;
|
||||
}
|
||||
|
||||
if (pmc <= 4) {
|
||||
@@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||
}
|
||||
}
|
||||
|
||||
ebb_bhrb:
|
||||
if (!pmc && ebb)
|
||||
/* EBB events must specify the PMC */
|
||||
return -1;
|
||||
|
||||
@@ -238,12 +238,11 @@ config TAU
|
||||
temperature within 2-4 degrees Celsius. This option shows the current
|
||||
on-die temperature in /proc/cpuinfo if the cpu supports it.
|
||||
|
||||
Unfortunately, on some chip revisions, this sensor is very inaccurate
|
||||
and in many cases, does not work at all, so don't assume the cpu
|
||||
temp is actually what /proc/cpuinfo says it is.
|
||||
Unfortunately, this sensor is very inaccurate when uncalibrated, so
|
||||
don't assume the cpu temp is actually what /proc/cpuinfo says it is.
|
||||
|
||||
config TAU_INT
|
||||
bool "Interrupt driven TAU driver (DANGEROUS)"
|
||||
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
|
||||
depends on TAU
|
||||
---help---
|
||||
The TAU supports an interrupt driven mode which causes an interrupt
|
||||
@@ -251,12 +250,7 @@ config TAU_INT
|
||||
to get notified the temp has exceeded a range. With this option off,
|
||||
a timer is used to re-check the temperature periodically.
|
||||
|
||||
However, on some cpus it appears that the TAU interrupt hardware
|
||||
is buggy and can cause a situation which would lead unexplained hard
|
||||
lockups.
|
||||
|
||||
Unless you are extending the TAU driver, or enjoy kernel/hardware
|
||||
debugging, leave this option off.
|
||||
If in doubt, say N here.
|
||||
|
||||
config TAU_AVERAGE
|
||||
bool "Average high and low temp"
|
||||
|
||||
@@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
|
||||
return count;
|
||||
}
|
||||
|
||||
static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
||||
uint32_t type)
|
||||
static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
|
||||
{
|
||||
struct dump_obj *dump;
|
||||
int rc;
|
||||
|
||||
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
|
||||
if (!dump)
|
||||
return NULL;
|
||||
return;
|
||||
|
||||
dump->kobj.kset = dump_kset;
|
||||
|
||||
@@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
||||
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
|
||||
if (rc) {
|
||||
kobject_put(&dump->kobj);
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* As soon as the sysfs file for this dump is created/activated there is
|
||||
* a chance the opal_errd daemon (or any userspace) might read and
|
||||
* acknowledge the dump before kobject_uevent() is called. If that
|
||||
* happens then there is a potential race between
|
||||
* dump_ack_store->kobject_put() and kobject_uevent() which leads to a
|
||||
* use-after-free of a kernfs object resulting in a kernel crash.
|
||||
*
|
||||
* To avoid that, we need to take a reference on behalf of the bin file,
|
||||
* so that our reference remains valid while we call kobject_uevent().
|
||||
* We then drop our reference before exiting the function, leaving the
|
||||
* bin file to drop the last reference (if it hasn't already).
|
||||
*/
|
||||
|
||||
/* Take a reference for the bin file */
|
||||
kobject_get(&dump->kobj);
|
||||
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
|
||||
if (rc) {
|
||||
if (rc == 0) {
|
||||
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
||||
|
||||
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
||||
__func__, dump->id, dump->size);
|
||||
} else {
|
||||
/* Drop reference count taken for bin file */
|
||||
kobject_put(&dump->kobj);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
||||
__func__, dump->id, dump->size);
|
||||
|
||||
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
||||
|
||||
return dump;
|
||||
/* Drop our reference */
|
||||
kobject_put(&dump->kobj);
|
||||
return;
|
||||
}
|
||||
|
||||
static irqreturn_t process_dump(int irq, void *data)
|
||||
|
||||
@@ -40,6 +40,7 @@ static __init int rng_init(void)
|
||||
|
||||
ppc_md.get_random_seed = pseries_get_random_long;
|
||||
|
||||
of_node_put(dn);
|
||||
return 0;
|
||||
}
|
||||
machine_subsys_initcall(pseries, rng_init);
|
||||
|
||||
@@ -179,6 +179,7 @@ int icp_hv_init(void)
|
||||
|
||||
icp_ops = &icp_hv_ops;
|
||||
|
||||
of_node_put(np);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
|
||||
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
|
||||
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
|
||||
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
|
||||
adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
|
||||
|
||||
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
||||
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
||||
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||
|
||||
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
|
||||
|
||||
|
||||
@@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
|
||||
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
||||
CONFIG_PROFILING=y
|
||||
CONFIG_SMP=y
|
||||
CONFIG_X86_X2APIC=y
|
||||
CONFIG_HYPERVISOR_GUEST=y
|
||||
CONFIG_PARAVIRT=y
|
||||
CONFIG_NR_CPUS=32
|
||||
@@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||
CONFIG_DM_BOW=y
|
||||
CONFIG_NETDEVICES=y
|
||||
CONFIG_DUMMY=y
|
||||
CONFIG_WIREGUARD=y
|
||||
CONFIG_TUN=y
|
||||
CONFIG_VETH=y
|
||||
# CONFIG_ETHERNET is not set
|
||||
@@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
|
||||
CONFIG_HID_SONY=y
|
||||
CONFIG_HID_STEAM=y
|
||||
CONFIG_USB_HIDDEV=y
|
||||
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||
CONFIG_USB_XHCI_HCD=y
|
||||
CONFIG_USB_GADGET=y
|
||||
CONFIG_USB_GADGET_VBUS_DRAW=500
|
||||
@@ -435,6 +438,7 @@ CONFIG_CRC8=y
|
||||
CONFIG_XZ_DEC=y
|
||||
CONFIG_PRINTK_TIME=y
|
||||
CONFIG_DEBUG_INFO=y
|
||||
CONFIG_DEBUG_INFO_DWARF4=y
|
||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||
CONFIG_MAGIC_SYSRQ=y
|
||||
|
||||
1
arch/x86/crypto/.gitignore
vendored
Normal file
1
arch/x86/crypto/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
poly1305-x86_64-cryptogams.S
|
||||
@@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
|
||||
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
||||
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
||||
$(comma)4)$(comma)%ymm2,yes,no)
|
||||
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
|
||||
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||
adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
|
||||
|
||||
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
||||
|
||||
@@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||
@@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
|
||||
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
|
||||
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
|
||||
|
||||
# These modules require the assembler to support ADX.
|
||||
ifeq ($(adx_supported),yes)
|
||||
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
|
||||
endif
|
||||
|
||||
# These modules require assembler to support AVX.
|
||||
ifeq ($(avx_supported),yes)
|
||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
||||
@@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
|
||||
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
|
||||
endif
|
||||
|
||||
# These modules require assembler to support AVX2.
|
||||
@@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
|
||||
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
||||
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
||||
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
||||
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
|
||||
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
|
||||
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
||||
|
||||
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
||||
@@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
|
||||
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
|
||||
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
|
||||
|
||||
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
|
||||
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
|
||||
ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
|
||||
targets += poly1305-x86_64-cryptogams.S
|
||||
endif
|
||||
|
||||
ifeq ($(avx_supported),yes)
|
||||
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
||||
camellia_aesni_avx_glue.o
|
||||
@@ -100,20 +114,22 @@ endif
|
||||
|
||||
ifeq ($(avx2_supported),yes)
|
||||
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
||||
chacha20-x86_64-y += chacha20-avx2-x86_64.o
|
||||
chacha-x86_64-y += chacha-avx2-x86_64.o
|
||||
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
||||
|
||||
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
|
||||
endif
|
||||
|
||||
ifeq ($(avx512_supported),yes)
|
||||
chacha-x86_64-y += chacha-avx512vl-x86_64.o
|
||||
endif
|
||||
|
||||
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
||||
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
||||
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
|
||||
ifeq ($(avx2_supported),yes)
|
||||
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
||||
poly1305-x86_64-y += poly1305-avx2-x86_64.o
|
||||
endif
|
||||
ifeq ($(sha1_ni_supported),yes)
|
||||
sha1-ssse3-y += sha1_ni_asm.o
|
||||
@@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
|
||||
endif
|
||||
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
||||
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
|
||||
|
||||
quiet_cmd_perlasm = PERLASM $@
|
||||
cmd_perlasm = $(PERL) $< > $@
|
||||
$(obj)/%.S: $(src)/%.pl FORCE
|
||||
$(call if_changed,perlasm)
|
||||
|
||||
258
arch/x86/crypto/blake2s-core.S
Normal file
258
arch/x86/crypto/blake2s-core.S
Normal file
@@ -0,0 +1,258 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||
/*
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
/*
 * Read-only constants shared by the BLAKE2s compression routines below.
 *
 * IV holds eight 32-bit words, lowest word first in each .octa
 * (0x6A09E667, 0xBB67AE85, ... 0x5BE0CD19).
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
|
||||
.align 32
|
||||
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
|
||||
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
|
||||
/*
 * pshufb mask: within each 32-bit lane, destination bytes 0..3 take source
 * bytes 2,3,0,1, i.e. every dword is rotated by 16 bits.
 */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
|
||||
.align 16
|
||||
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
|
||||
/*
 * pshufb mask: within each 32-bit lane, destination bytes 0..3 take source
 * bytes 1,2,3,0, i.e. every dword is rotated right by 8 bits.
 */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
|
||||
.align 16
|
||||
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
|
||||
/*
 * Ten rows of sixteen byte-sized message-word indices (160 bytes), one row
 * per round.  blake2s_compress_ssse3 reads each byte, scales it by 4 and
 * loads the corresponding 32-bit word from the input block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
|
||||
.align 64
|
||||
SIGMA:
|
||||
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
|
||||
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
|
||||
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
|
||||
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
|
||||
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
|
||||
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
|
||||
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
|
||||
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
|
||||
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
|
||||
/*
 * Ten rows of sixteen dword indices (640 bytes) used as vpermi2d index
 * vectors by blake2s_compress_avx512.  That routine copies each permuted
 * result back into its message registers, so every row is applied to the
 * ordering left by the previous row rather than to the original block.
 */
#ifdef CONFIG_AS_AVX512
|
||||
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
|
||||
.align 64
|
||||
SIGMA2:
|
||||
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
|
||||
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
|
||||
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
|
||||
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
|
||||
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
|
||||
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
|
||||
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
|
||||
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
|
||||
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
|
||||
#endif /* CONFIG_AS_AVX512 */
|
||||
|
||||
.text
|
||||
#ifdef CONFIG_AS_SSSE3
|
||||
/*
 * blake2s_compress_ssse3 - SSSE3 BLAKE2s block compression loop.
 *
 * Arguments, following the C prototype in blake2s-glue.c:
 *   %rdi: state     %rsi: block     %rdx: nblocks     %rcx: inc
 *
 * Returns immediately, without touching the state, when nblocks is zero.
 * Otherwise processes nblocks consecutive 64-byte blocks and writes back
 * the first 0x30 bytes of the state once at the end.
 */
ENTRY(blake2s_compress_ssse3)
|
||||
testq %rdx,%rdx
|
||||
je .Lendofloop
|
||||
/* xmm0/xmm1 = state bytes 0x00..0x1f, xmm14 = state bytes 0x20..0x2f */
movdqu (%rdi),%xmm0
|
||||
movdqu 0x10(%rdi),%xmm1
|
||||
movdqa ROT16(%rip),%xmm12
|
||||
movdqa ROR328(%rip),%xmm13
|
||||
movdqu 0x20(%rdi),%xmm14
|
||||
/* Park inc in xmm15 (upper quadword zero); %rcx is reused as the SIGMA cursor. */
movq %rcx,%xmm15
|
||||
/* %r8 = one past the end of the 160-byte SIGMA table, the round-loop sentinel */
leaq SIGMA+0xa0(%rip),%r8
|
||||
jmp .Lbeginofloop
|
||||
.align 32
|
||||
/*
 * Per block: keep the incoming xmm0/xmm1 in xmm10/xmm11 for the final xor,
 * add inc to the low quadword loaded from state+0x20, and seed the working
 * rows xmm2/xmm3 from IV (xmm3 additionally xored with xmm14).
 */
.Lbeginofloop:
|
||||
movdqa %xmm0,%xmm10
|
||||
movdqa %xmm1,%xmm11
|
||||
paddq %xmm15,%xmm14
|
||||
movdqa IV(%rip),%xmm2
|
||||
movdqa %xmm14,%xmm3
|
||||
pxor IV+0x10(%rip),%xmm3
|
||||
leaq SIGMA(%rip),%rcx
|
||||
/* One iteration per 16-byte SIGMA row; each quarter gathers four message words. */
.Lroundloop:
|
||||
movzbl (%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm4
|
||||
movzbl 0x1(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm5
|
||||
movzbl 0x2(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm6
|
||||
movzbl 0x3(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm7
|
||||
punpckldq %xmm5,%xmm4
|
||||
punpckldq %xmm7,%xmm6
|
||||
punpcklqdq %xmm6,%xmm4
|
||||
paddd %xmm4,%xmm0
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
/* rotate each dword of xmm3 by 16 (ROT16 mask) */
pshufb %xmm12,%xmm3
|
||||
paddd %xmm3,%xmm2
|
||||
pxor %xmm2,%xmm1
|
||||
/* rotate each dword of xmm1 right by 12 (shift pair + or, xmm8 as scratch) */
movdqa %xmm1,%xmm8
|
||||
psrld $0xc,%xmm1
|
||||
pslld $0x14,%xmm8
|
||||
por %xmm8,%xmm1
|
||||
movzbl 0x4(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm5
|
||||
movzbl 0x5(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm6
|
||||
movzbl 0x6(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm7
|
||||
movzbl 0x7(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm4
|
||||
punpckldq %xmm6,%xmm5
|
||||
punpckldq %xmm4,%xmm7
|
||||
punpcklqdq %xmm7,%xmm5
|
||||
paddd %xmm5,%xmm0
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
/* rotate each dword of xmm3 right by 8 (ROR328 mask) */
pshufb %xmm13,%xmm3
|
||||
paddd %xmm3,%xmm2
|
||||
pxor %xmm2,%xmm1
|
||||
/* rotate each dword of xmm1 right by 7 */
movdqa %xmm1,%xmm8
|
||||
psrld $0x7,%xmm1
|
||||
pslld $0x19,%xmm8
|
||||
por %xmm8,%xmm1
|
||||
/* Rotate the dword lanes of xmm0/xmm3/xmm2 for the second half of the round. */
pshufd $0x93,%xmm0,%xmm0
|
||||
pshufd $0x4e,%xmm3,%xmm3
|
||||
pshufd $0x39,%xmm2,%xmm2
|
||||
movzbl 0x8(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm6
|
||||
movzbl 0x9(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm7
|
||||
movzbl 0xa(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm4
|
||||
movzbl 0xb(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm5
|
||||
punpckldq %xmm7,%xmm6
|
||||
punpckldq %xmm5,%xmm4
|
||||
punpcklqdq %xmm4,%xmm6
|
||||
paddd %xmm6,%xmm0
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
pshufb %xmm12,%xmm3
|
||||
paddd %xmm3,%xmm2
|
||||
pxor %xmm2,%xmm1
|
||||
movdqa %xmm1,%xmm8
|
||||
psrld $0xc,%xmm1
|
||||
pslld $0x14,%xmm8
|
||||
por %xmm8,%xmm1
|
||||
movzbl 0xc(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm7
|
||||
movzbl 0xd(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm4
|
||||
movzbl 0xe(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm5
|
||||
movzbl 0xf(%rcx),%eax
|
||||
movd (%rsi,%rax,4),%xmm6
|
||||
punpckldq %xmm4,%xmm7
|
||||
punpckldq %xmm6,%xmm5
|
||||
punpcklqdq %xmm5,%xmm7
|
||||
paddd %xmm7,%xmm0
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
pshufb %xmm13,%xmm3
|
||||
paddd %xmm3,%xmm2
|
||||
pxor %xmm2,%xmm1
|
||||
movdqa %xmm1,%xmm8
|
||||
psrld $0x7,%xmm1
|
||||
pslld $0x19,%xmm8
|
||||
por %xmm8,%xmm1
|
||||
/* Inverse lane rotation: restores the lane order used at the top of the round. */
pshufd $0x39,%xmm0,%xmm0
|
||||
pshufd $0x4e,%xmm3,%xmm3
|
||||
pshufd $0x93,%xmm2,%xmm2
|
||||
/* Advance to the next SIGMA row; stop after the last one. */
addq $0x10,%rcx
|
||||
cmpq %r8,%rcx
|
||||
jnz .Lroundloop
|
||||
/* Fold the working rows and the saved input back into xmm0/xmm1. */
pxor %xmm2,%xmm0
|
||||
pxor %xmm3,%xmm1
|
||||
pxor %xmm10,%xmm0
|
||||
pxor %xmm11,%xmm1
|
||||
/* Next 64-byte block; loop until nblocks reaches zero. */
addq $0x40,%rsi
|
||||
decq %rdx
|
||||
jnz .Lbeginofloop
|
||||
/* Write back the updated state bytes 0x00..0x2f. */
movdqu %xmm0,(%rdi)
|
||||
movdqu %xmm1,0x10(%rdi)
|
||||
movdqu %xmm14,0x20(%rdi)
|
||||
.Lendofloop:
|
||||
ret
|
||||
ENDPROC(blake2s_compress_ssse3)
|
||||
#endif /* CONFIG_AS_SSSE3 */
|
||||
|
||||
#ifdef CONFIG_AS_AVX512
|
||||
/*
 * blake2s_compress_avx512 - AVX-512VL BLAKE2s block compression loop.
 *
 * Arguments, following the C prototype in blake2s-glue.c:
 *   %rdi: state     %rsi: block     %rdx: nblocks     %rcx: inc
 *
 * NOTE: unlike blake2s_compress_ssse3 there is no test for nblocks == 0
 * before the first block is processed; the counter is only decremented
 * and tested after a block has been consumed, so callers must pass
 * nblocks >= 1.
 */
ENTRY(blake2s_compress_avx512)
|
||||
/* xmm0/xmm1 = state bytes 0x00..0x1f, xmm4 = state bytes 0x20..0x2f */
vmovdqu (%rdi),%xmm0
|
||||
vmovdqu 0x10(%rdi),%xmm1
|
||||
vmovdqu 0x20(%rdi),%xmm4
|
||||
/* Park inc in xmm5; %cl is reused below as the round counter. */
vmovq %rcx,%xmm5
|
||||
vmovdqa IV(%rip),%xmm14
|
||||
vmovdqa IV+16(%rip),%xmm15
|
||||
jmp .Lblake2s_compress_avx512_mainloop
|
||||
.align 32
|
||||
/*
 * Per block: keep the incoming xmm0/xmm1 in xmm10/xmm11 for the final xor,
 * add inc to the low quadword of xmm4, seed xmm2/xmm3 from the cached IV
 * halves and load the whole 64-byte block into ymm6/ymm7.
 */
.Lblake2s_compress_avx512_mainloop:
|
||||
vmovdqa %xmm0,%xmm10
|
||||
vmovdqa %xmm1,%xmm11
|
||||
vpaddq %xmm5,%xmm4,%xmm4
|
||||
vmovdqa %xmm14,%xmm2
|
||||
vpxor %xmm15,%xmm4,%xmm3
|
||||
vmovdqu (%rsi),%ymm6
|
||||
vmovdqu 0x20(%rsi),%ymm7
|
||||
addq $0x40,%rsi
|
||||
leaq SIGMA2(%rip),%rax
|
||||
/* 10 rounds */
movb $0xa,%cl
|
||||
.Lblake2s_compress_avx512_roundloop:
|
||||
/*
 * Load the next 64-byte SIGMA2 row as two index vectors, permute the
 * message with them, and keep the permuted message in ymm6/ymm7 so the
 * following row is applied on top of this ordering.
 */
addq $0x40,%rax
|
||||
vmovdqa -0x40(%rax),%ymm8
|
||||
vmovdqa -0x20(%rax),%ymm9
|
||||
vpermi2d %ymm7,%ymm6,%ymm8
|
||||
vpermi2d %ymm7,%ymm6,%ymm9
|
||||
vmovdqa %ymm8,%ymm6
|
||||
vmovdqa %ymm9,%ymm7
|
||||
/* First half of the round: message words from ymm8 (low lane, then high lane). */
vpaddd %xmm8,%xmm0,%xmm0
|
||||
vpaddd %xmm1,%xmm0,%xmm0
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
/* vprord rotates each dword right by the immediate: 16, 12, 8, 7 in turn. */
vprord $0x10,%xmm3,%xmm3
|
||||
vpaddd %xmm3,%xmm2,%xmm2
|
||||
vpxor %xmm2,%xmm1,%xmm1
|
||||
vprord $0xc,%xmm1,%xmm1
|
||||
vextracti128 $0x1,%ymm8,%xmm8
|
||||
vpaddd %xmm8,%xmm0,%xmm0
|
||||
vpaddd %xmm1,%xmm0,%xmm0
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vprord $0x8,%xmm3,%xmm3
|
||||
vpaddd %xmm3,%xmm2,%xmm2
|
||||
vpxor %xmm2,%xmm1,%xmm1
|
||||
vprord $0x7,%xmm1,%xmm1
|
||||
/* Rotate the dword lanes of xmm0/xmm3/xmm2 for the second half of the round. */
vpshufd $0x93,%xmm0,%xmm0
|
||||
vpshufd $0x4e,%xmm3,%xmm3
|
||||
vpshufd $0x39,%xmm2,%xmm2
|
||||
/* Second half of the round: message words from ymm9 (low lane, then high lane). */
vpaddd %xmm9,%xmm0,%xmm0
|
||||
vpaddd %xmm1,%xmm0,%xmm0
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vprord $0x10,%xmm3,%xmm3
|
||||
vpaddd %xmm3,%xmm2,%xmm2
|
||||
vpxor %xmm2,%xmm1,%xmm1
|
||||
vprord $0xc,%xmm1,%xmm1
|
||||
vextracti128 $0x1,%ymm9,%xmm9
|
||||
vpaddd %xmm9,%xmm0,%xmm0
|
||||
vpaddd %xmm1,%xmm0,%xmm0
|
||||
vpxor %xmm0,%xmm3,%xmm3
|
||||
vprord $0x8,%xmm3,%xmm3
|
||||
vpaddd %xmm3,%xmm2,%xmm2
|
||||
vpxor %xmm2,%xmm1,%xmm1
|
||||
vprord $0x7,%xmm1,%xmm1
|
||||
/* Inverse lane rotation: restores the lane order used at the top of the round. */
vpshufd $0x39,%xmm0,%xmm0
|
||||
vpshufd $0x4e,%xmm3,%xmm3
|
||||
vpshufd $0x93,%xmm2,%xmm2
|
||||
decb %cl
|
||||
jne .Lblake2s_compress_avx512_roundloop
|
||||
/* Fold the saved input and the working rows back into xmm0/xmm1. */
vpxor %xmm10,%xmm0,%xmm0
|
||||
vpxor %xmm11,%xmm1,%xmm1
|
||||
vpxor %xmm2,%xmm0,%xmm0
|
||||
vpxor %xmm3,%xmm1,%xmm1
|
||||
/* Counter is tested only after a block has been processed (see header note). */
decq %rdx
|
||||
jne .Lblake2s_compress_avx512_mainloop
|
||||
/* Write back the updated state bytes 0x00..0x2f. */
vmovdqu %xmm0,(%rdi)
|
||||
vmovdqu %xmm1,0x10(%rdi)
|
||||
vmovdqu %xmm4,0x20(%rdi)
|
||||
vzeroupper
|
||||
retq
|
||||
ENDPROC(blake2s_compress_avx512)
|
||||
#endif /* CONFIG_AS_AVX512 */
|
||||
232
arch/x86/crypto/blake2s-glue.c
Normal file
232
arch/x86/crypto/blake2s-glue.c
Normal file
@@ -0,0 +1,232 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||
/*
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#include <crypto/internal/blake2s.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
|
||||
const u8 *block, const size_t nblocks,
|
||||
const u32 inc);
|
||||
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
|
||||
const u8 *block, const size_t nblocks,
|
||||
const u32 inc);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
||||
|
||||
void blake2s_compress_arch(struct blake2s_state *state,
|
||||
const u8 *block, size_t nblocks,
|
||||
const u32 inc)
|
||||
{
|
||||
/* SIMD disables preemption, so relax after processing each page. */
|
||||
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
|
||||
|
||||
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
|
||||
blake2s_compress_generic(state, block, nblocks, inc);
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
const size_t blocks = min_t(size_t, nblocks,
|
||||
SZ_4K / BLAKE2S_BLOCK_SIZE);
|
||||
|
||||
kernel_fpu_begin();
|
||||
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||
static_branch_likely(&blake2s_use_avx512))
|
||||
blake2s_compress_avx512(state, block, blocks, inc);
|
||||
else
|
||||
blake2s_compress_ssse3(state, block, blocks, inc);
|
||||
kernel_fpu_end();
|
||||
|
||||
nblocks -= blocks;
|
||||
block += blocks * BLAKE2S_BLOCK_SIZE;
|
||||
} while (nblocks);
|
||||
}
|
||||
EXPORT_SYMBOL(blake2s_compress_arch);
|
||||
|
||||
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
{
|
||||
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
||||
|
||||
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
memcpy(tctx->key, key, keylen);
|
||||
tctx->keylen = keylen;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_blake2s_init(struct shash_desc *desc)
|
||||
{
|
||||
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||
const int outlen = crypto_shash_digestsize(desc->tfm);
|
||||
|
||||
if (tctx->keylen)
|
||||
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
|
||||
else
|
||||
blake2s_init(state, outlen);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
|
||||
unsigned int inlen)
|
||||
{
|
||||
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
|
||||
|
||||
if (unlikely(!inlen))
|
||||
return 0;
|
||||
if (inlen > fill) {
|
||||
memcpy(state->buf + state->buflen, in, fill);
|
||||
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
|
||||
state->buflen = 0;
|
||||
in += fill;
|
||||
inlen -= fill;
|
||||
}
|
||||
if (inlen > BLAKE2S_BLOCK_SIZE) {
|
||||
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
|
||||
/* Hash one less (full) block than strictly possible */
|
||||
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
|
||||
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||
}
|
||||
memcpy(state->buf + state->buflen, in, inlen);
|
||||
state->buflen += inlen;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * shash .final handler: finish the hash and write the digest to @out.
 *
 * Marks the last block, zero-pads the buffered tail to a full block and
 * compresses it with the buffered byte count as the increment.  The state
 * words are then converted to little-endian, state->outlen bytes are copied
 * to @out, and the whole state is wiped.  Always returns 0.
 */
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
	struct blake2s_state *st = shash_desc_ctx(desc);

	blake2s_set_lastblock(st);

	/* Padding */
	memset(&st->buf[st->buflen], 0, BLAKE2S_BLOCK_SIZE - st->buflen);
	blake2s_compress_arch(st, st->buf, 1, st->buflen);

	cpu_to_le32_array(st->h, ARRAY_SIZE(st->h));
	memcpy(out, st->h, st->outlen);

	/* Do not leave hash state (possibly key-derived) behind. */
	memzero_explicit(st, sizeof(*st));

	return 0;
}
|
||||
|
||||
static struct shash_alg blake2s_algs[] = {{
|
||||
.base.cra_name = "blake2s-128",
|
||||
.base.cra_driver_name = "blake2s-128-x86",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_128_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-160",
|
||||
.base.cra_driver_name = "blake2s-160-x86",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_160_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-224",
|
||||
.base.cra_driver_name = "blake2s-224-x86",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_224_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-256",
|
||||
.base.cra_driver_name = "blake2s-256-x86",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_256_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}};
|
||||
|
||||
/*
 * Module init: select the compression implementation and register the
 * shash algorithms.
 *
 * Without SSSE3 nothing is enabled or registered and 0 is returned.  With
 * SSSE3 the blake2s_use_ssse3 key is enabled; the blake2s_use_avx512 key is
 * enabled as well when the assembler supports AVX-512 and the CPU reports
 * AVX, AVX2, AVX512F, AVX512VL and the matching xfeatures.  Returns the
 * result of crypto_register_shashes() when CRYPTO_HASH is reachable,
 * otherwise 0.
 */
static int __init blake2s_mod_init(void)
{
	bool want_avx512;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&blake2s_use_ssse3);

	want_avx512 = IS_ENABLED(CONFIG_AS_AVX512) &&
		      boot_cpu_has(X86_FEATURE_AVX) &&
		      boot_cpu_has(X86_FEATURE_AVX2) &&
		      boot_cpu_has(X86_FEATURE_AVX512F) &&
		      boot_cpu_has(X86_FEATURE_AVX512VL) &&
		      cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
					XFEATURE_MASK_AVX512, NULL);
	if (want_avx512)
		static_branch_enable(&blake2s_use_avx512);

	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
|
||||
|
||||
/*
 * Module exit: unregister the shash algorithms, under the same conditions
 * in which blake2s_mod_init() attempted to register them (CRYPTO_HASH
 * reachable and SSSE3 present).
 */
static void __exit blake2s_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return;

	crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
|
||||
|
||||
module_init(blake2s_mod_init);
|
||||
module_exit(blake2s_mod_exit);
|
||||
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
File diff suppressed because it is too large
Load Diff
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
@@ -0,0 +1,836 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
/*
|
||||
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
|
||||
*
|
||||
* Copyright (C) 2018 Martin Willi
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR2BL: .octa 0x00000000000000000000000000000000
|
||||
.octa 0x00000000000000000000000000000001
|
||||
|
||||
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR4BL: .octa 0x00000000000000000000000000000002
|
||||
.octa 0x00000000000000000000000000000003
|
||||
|
||||
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR8BL: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha_2block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 2 data blocks output, o
|
||||
# %rdx: up to 2 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts two ChaCha blocks by loading the state
|
||||
# matrix twice across four AVX registers. It performs matrix operations
|
||||
# on four words in each matrix in parallel, but requires shuffling to
|
||||
# rearrange the words after each round.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-1] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vmovdqa %ymm1,%ymm9
|
||||
vmovdqa %ymm2,%ymm10
|
||||
vmovdqa %ymm3,%ymm11
|
||||
|
||||
.Ldoubleround:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround
|
||||
|
||||
# o0 = i0 ^ (x0 + s0)
|
||||
vpaddd %ymm8,%ymm0,%ymm7
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x00(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x00(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1)
|
||||
vpaddd %ymm9,%ymm1,%ymm7
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x10(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x10(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2)
|
||||
vpaddd %ymm10,%ymm2,%ymm7
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x20(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x20(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3)
|
||||
vpaddd %ymm11,%ymm3,%ymm7
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x30(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x30(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm7
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x40(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm7
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x50(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm7
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x60(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm7
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x70(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x70(%rsi)
|
||||
|
||||
.Ldone2:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart2:
	# xor remaining bytes from partial register into output
	#
	# Reached from chacha_2block_xor_avx512vl when fewer bytes remain
	# than the 16-byte chunk about to be written. On entry:
	#   %rcx  - total input/output length in bytes
	#   %xmm7 - keystream for the chunk that is only partially used
	# Clobbers %rax, %rcx, %r9, %k1 and %xmm1.

	# %rcx = number of trailing bytes (len % 16); nothing to do if zero.
	# Leave through this function's own epilogue rather than through
	# .Ldone8, which belongs to chacha_8block_xor_avx512vl.
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone2

	# %r9 = byte offset of the partial chunk (len rounded down to 16)
	mov		%rax,%r9
	and		$~0xf,%r9

	# %k1 = (1 << %cl) - 1: one mask bit per trailing byte
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	# Masked load (unselected bytes zeroed), xor with the keystream,
	# masked store: only the selected bytes are read and written.
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone2
|
||||
|
||||
ENDPROC(chacha_2block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_4block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 4 data blocks output, o
|
||||
# %rdx: up to 4 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts four ChaCha blocks by loading the state
|
||||
# matrix four times across eight AVX registers. It performs matrix
|
||||
# operations on four words in two matrices in parallel, sequentially
|
||||
# to the operations on the four words of the other two matrices. The
|
||||
# required word shuffling has a rather high latency, we can do the
|
||||
# arithmetic on two matrix-pairs without much slowdown.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-3] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm4
|
||||
vmovdqa %ymm1,%ymm5
|
||||
vmovdqa %ymm2,%ymm6
|
||||
vmovdqa %ymm3,%ymm7
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
vpaddd CTR4BL(%rip),%ymm7,%ymm7
|
||||
|
||||
vmovdqa %ymm0,%ymm11
|
||||
vmovdqa %ymm1,%ymm12
|
||||
vmovdqa %ymm2,%ymm13
|
||||
vmovdqa %ymm3,%ymm14
|
||||
vmovdqa %ymm7,%ymm15
|
||||
|
||||
.Ldoubleround4:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
vpshufd $0x39,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
vpshufd $0x93,%ymm7,%ymm7
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
vpshufd $0x93,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
vpshufd $0x39,%ymm7,%ymm7
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround4
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), first block
|
||||
vpaddd %ymm11,%ymm0,%ymm10
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x00(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x00(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1), first block
|
||||
vpaddd %ymm12,%ymm1,%ymm10
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x10(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x10(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2), first block
|
||||
vpaddd %ymm13,%ymm2,%ymm10
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x20(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x20(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3), first block
|
||||
vpaddd %ymm14,%ymm3,%ymm10
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x30(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x30(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm10
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x40(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm10
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x50(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm10
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x60(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm10
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x70(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x70(%rsi)
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), third block
|
||||
vpaddd %ymm11,%ymm4,%ymm10
|
||||
cmp $0x90,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x80(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x80(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm4
|
||||
# o1 = i1 ^ (x1 + s1), third block
|
||||
vpaddd %ymm12,%ymm5,%ymm10
|
||||
cmp $0xa0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x90(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x90(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm5
|
||||
# o2 = i2 ^ (x2 + s2), third block
|
||||
vpaddd %ymm13,%ymm6,%ymm10
|
||||
cmp $0xb0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xa0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xa0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm6
|
||||
# o3 = i3 ^ (x3 + s3), third block
|
||||
vpaddd %ymm15,%ymm7,%ymm10
|
||||
cmp $0xc0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xb0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xb0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm7
|
||||
|
||||
# xor and write fourth block
|
||||
vmovdqa %xmm4,%xmm10
|
||||
cmp $0xd0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xc0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xc0(%rsi)
|
||||
|
||||
vmovdqa %xmm5,%xmm10
|
||||
cmp $0xe0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xd0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xd0(%rsi)
|
||||
|
||||
vmovdqa %xmm6,%xmm10
|
||||
cmp $0xf0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xe0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xe0(%rsi)
|
||||
|
||||
vmovdqa %xmm7,%xmm10
|
||||
cmp $0x100,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xf0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xf0(%rsi)
|
||||
|
||||
.Ldone4:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart4:
	# xor remaining bytes from partial register into output
	#
	# Reached from chacha_4block_xor_avx512vl when fewer bytes remain
	# than the 16-byte chunk about to be written. On entry:
	#   %rcx   - total input/output length in bytes
	#   %xmm10 - keystream for the chunk that is only partially used
	# Clobbers %rax, %rcx, %r9, %k1 and %xmm1.

	# %rcx = number of trailing bytes (len % 16); nothing to do if zero.
	# Leave through this function's own epilogue rather than through
	# .Ldone8, which belongs to chacha_8block_xor_avx512vl.
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone4

	# %r9 = byte offset of the partial chunk (len rounded down to 16)
	mov		%rax,%r9
	and		$~0xf,%r9

	# %k1 = (1 << %cl) - 1: one mask bit per trailing byte
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	# Masked load (unselected bytes zeroed), xor with the keystream,
	# masked store: only the selected bytes are read and written.
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone4
|
||||
|
||||
ENDPROC(chacha_4block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_8block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 8 data blocks output, o
|
||||
# %rdx: up to 8 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts eight consecutive ChaCha blocks by loading
|
||||
# the state matrix in AVX registers eight times. Compared to AVX2, this
|
||||
# mostly benefits from the new rotate instructions in VL and the
|
||||
# additional registers.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
|
||||
# x12 += counter values 0-7
|
||||
vpaddd CTR8BL(%rip),%ymm12,%ymm12
|
||||
|
||||
vmovdqa64 %ymm0,%ymm16
|
||||
vmovdqa64 %ymm1,%ymm17
|
||||
vmovdqa64 %ymm2,%ymm18
|
||||
vmovdqa64 %ymm3,%ymm19
|
||||
vmovdqa64 %ymm4,%ymm20
|
||||
vmovdqa64 %ymm5,%ymm21
|
||||
vmovdqa64 %ymm6,%ymm22
|
||||
vmovdqa64 %ymm7,%ymm23
|
||||
vmovdqa64 %ymm8,%ymm24
|
||||
vmovdqa64 %ymm9,%ymm25
|
||||
vmovdqa64 %ymm10,%ymm26
|
||||
vmovdqa64 %ymm11,%ymm27
|
||||
vmovdqa64 %ymm12,%ymm28
|
||||
vmovdqa64 %ymm13,%ymm29
|
||||
vmovdqa64 %ymm14,%ymm30
|
||||
vmovdqa64 %ymm15,%ymm31
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-7] += s[0..15]
|
||||
vpaddd %ymm16,%ymm0,%ymm0
|
||||
vpaddd %ymm17,%ymm1,%ymm1
|
||||
vpaddd %ymm18,%ymm2,%ymm2
|
||||
vpaddd %ymm19,%ymm3,%ymm3
|
||||
vpaddd %ymm20,%ymm4,%ymm4
|
||||
vpaddd %ymm21,%ymm5,%ymm5
|
||||
vpaddd %ymm22,%ymm6,%ymm6
|
||||
vpaddd %ymm23,%ymm7,%ymm7
|
||||
vpaddd %ymm24,%ymm8,%ymm8
|
||||
vpaddd %ymm25,%ymm9,%ymm9
|
||||
vpaddd %ymm26,%ymm10,%ymm10
|
||||
vpaddd %ymm27,%ymm11,%ymm11
|
||||
vpaddd %ymm28,%ymm12,%ymm12
|
||||
vpaddd %ymm29,%ymm13,%ymm13
|
||||
vpaddd %ymm30,%ymm14,%ymm14
|
||||
vpaddd %ymm31,%ymm15,%ymm15
|
||||
|
||||
# interleave 32-bit words in state n, n+1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm16
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm17
|
||||
vpunpckldq %ymm3,%ymm2,%ymm18
|
||||
vpunpckhdq %ymm3,%ymm2,%ymm19
|
||||
vpunpckldq %ymm5,%ymm4,%ymm20
|
||||
vpunpckhdq %ymm5,%ymm4,%ymm21
|
||||
vpunpckldq %ymm7,%ymm6,%ymm22
|
||||
vpunpckhdq %ymm7,%ymm6,%ymm23
|
||||
vpunpckldq %ymm9,%ymm8,%ymm24
|
||||
vpunpckhdq %ymm9,%ymm8,%ymm25
|
||||
vpunpckldq %ymm11,%ymm10,%ymm26
|
||||
vpunpckhdq %ymm11,%ymm10,%ymm27
|
||||
vpunpckldq %ymm13,%ymm12,%ymm28
|
||||
vpunpckhdq %ymm13,%ymm12,%ymm29
|
||||
vpunpckldq %ymm15,%ymm14,%ymm30
|
||||
vpunpckhdq %ymm15,%ymm14,%ymm31
|
||||
|
||||
# interleave 64-bit words in state n, n+2
|
||||
vpunpcklqdq %ymm18,%ymm16,%ymm0
|
||||
vpunpcklqdq %ymm19,%ymm17,%ymm1
|
||||
vpunpckhqdq %ymm18,%ymm16,%ymm2
|
||||
vpunpckhqdq %ymm19,%ymm17,%ymm3
|
||||
vpunpcklqdq %ymm22,%ymm20,%ymm4
|
||||
vpunpcklqdq %ymm23,%ymm21,%ymm5
|
||||
vpunpckhqdq %ymm22,%ymm20,%ymm6
|
||||
vpunpckhqdq %ymm23,%ymm21,%ymm7
|
||||
vpunpcklqdq %ymm26,%ymm24,%ymm8
|
||||
vpunpcklqdq %ymm27,%ymm25,%ymm9
|
||||
vpunpckhqdq %ymm26,%ymm24,%ymm10
|
||||
vpunpckhqdq %ymm27,%ymm25,%ymm11
|
||||
vpunpcklqdq %ymm30,%ymm28,%ymm12
|
||||
vpunpcklqdq %ymm31,%ymm29,%ymm13
|
||||
vpunpckhqdq %ymm30,%ymm28,%ymm14
|
||||
vpunpckhqdq %ymm31,%ymm29,%ymm15
|
||||
|
||||
# interleave 128-bit words in state n, n+4
|
||||
# xor/write first four blocks
|
||||
vmovdqa64 %ymm0,%ymm16
|
||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
|
||||
cmp $0x0020,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0000(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0000(%rsi)
|
||||
vmovdqa64 %ymm16,%ymm0
|
||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||
|
||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||
cmp $0x0040,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0020(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0020(%rsi)
|
||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||
|
||||
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
|
||||
cmp $0x0060,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0040(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0040(%rsi)
|
||||
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
|
||||
|
||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||
cmp $0x0080,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0060(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0060(%rsi)
|
||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||
|
||||
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
|
||||
cmp $0x00a0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0080(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0080(%rsi)
|
||||
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
|
||||
|
||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||
cmp $0x00c0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00a0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00a0(%rsi)
|
||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||
|
||||
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
|
||||
cmp $0x00e0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00c0(%rsi)
|
||||
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
|
||||
|
||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||
cmp $0x0100,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00e0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00e0(%rsi)
|
||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||
|
||||
# xor remaining blocks, write to output
|
||||
vmovdqa64 %ymm4,%ymm0
|
||||
cmp $0x0120,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0100(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0100(%rsi)
|
||||
|
||||
vmovdqa64 %ymm12,%ymm0
|
||||
cmp $0x0140,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0120(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0120(%rsi)
|
||||
|
||||
vmovdqa64 %ymm6,%ymm0
|
||||
cmp $0x0160,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0140(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0140(%rsi)
|
||||
|
||||
vmovdqa64 %ymm14,%ymm0
|
||||
cmp $0x0180,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0160(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0160(%rsi)
|
||||
|
||||
vmovdqa64 %ymm5,%ymm0
|
||||
cmp $0x01a0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0180(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0180(%rsi)
|
||||
|
||||
vmovdqa64 %ymm13,%ymm0
|
||||
cmp $0x01c0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01a0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01a0(%rsi)
|
||||
|
||||
vmovdqa64 %ymm7,%ymm0
|
||||
cmp $0x01e0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01c0(%rsi)
|
||||
|
||||
vmovdqa64 %ymm15,%ymm0
|
||||
cmp $0x0200,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01e0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01e0(%rsi)
|
||||
|
||||
.Ldone8:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart8:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0x1f,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0x1f,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
|
||||
vpxord %ymm0,%ymm1,%ymm1
|
||||
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone8
|
||||
|
||||
ENDPROC(chacha_8block_xor_avx512vl)
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
@@ -10,6 +10,7 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/frame.h>
|
||||
|
||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||
.align 16
|
||||
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha20_block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 1 data block output, o
|
||||
# %rdx: 1 data block input, i
|
||||
|
||||
# This function encrypts one ChaCha20 block by loading the state matrix
|
||||
# in four SSE registers. It performs matrix operation on four words in
|
||||
# parallel, but requireds shuffling to rearrange the words after each
|
||||
# round. 8/16-bit word rotation is done with the slightly better
|
||||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
|
||||
# traditional shift+OR.
|
||||
|
||||
# x0..3 = s0..3
|
||||
movdqa 0x00(%rdi),%xmm0
|
||||
movdqa 0x10(%rdi),%xmm1
|
||||
movdqa 0x20(%rdi),%xmm2
|
||||
movdqa 0x30(%rdi),%xmm3
|
||||
movdqa %xmm0,%xmm8
|
||||
movdqa %xmm1,%xmm9
|
||||
movdqa %xmm2,%xmm10
|
||||
movdqa %xmm3,%xmm11
|
||||
/*
|
||||
* chacha_permute - permute one block
|
||||
*
|
||||
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
|
||||
* function performs matrix operations on four words in parallel, but requires
|
||||
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
|
||||
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
|
||||
* rotation uses traditional shift+OR.
|
||||
*
|
||||
* The round count is given in %r8d.
|
||||
*
|
||||
* Clobbers: %r8d, %xmm4-%xmm7
|
||||
*/
|
||||
chacha_permute:
|
||||
|
||||
movdqa ROT8(%rip),%xmm4
|
||||
movdqa ROT16(%rip),%xmm5
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
pshufd $0x39,%xmm3,%xmm3
|
||||
|
||||
dec %ecx
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround
|
||||
|
||||
ret
|
||||
ENDPROC(chacha_permute)
|
||||
|
||||
ENTRY(chacha_block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 1 data block output, o
|
||||
# %rdx: up to 1 data block input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
FRAME_BEGIN
|
||||
|
||||
# x0..3 = s0..3
|
||||
movdqu 0x00(%rdi),%xmm0
|
||||
movdqu 0x10(%rdi),%xmm1
|
||||
movdqu 0x20(%rdi),%xmm2
|
||||
movdqu 0x30(%rdi),%xmm3
|
||||
movdqa %xmm0,%xmm8
|
||||
movdqa %xmm1,%xmm9
|
||||
movdqa %xmm2,%xmm10
|
||||
movdqa %xmm3,%xmm11
|
||||
|
||||
mov %rcx,%rax
|
||||
call chacha_permute
|
||||
|
||||
# o0 = i0 ^ (x0 + s0)
|
||||
movdqu 0x00(%rdx),%xmm4
|
||||
paddd %xmm8,%xmm0
|
||||
cmp $0x10,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x00(%rdx),%xmm4
|
||||
pxor %xmm4,%xmm0
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
# o1 = i1 ^ (x1 + s1)
|
||||
movdqu 0x10(%rdx),%xmm5
|
||||
paddd %xmm9,%xmm1
|
||||
pxor %xmm5,%xmm1
|
||||
movdqu %xmm1,0x10(%rsi)
|
||||
movdqa %xmm1,%xmm0
|
||||
cmp $0x20,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x10(%rdx),%xmm0
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x10(%rsi)
|
||||
# o2 = i2 ^ (x2 + s2)
|
||||
movdqu 0x20(%rdx),%xmm6
|
||||
paddd %xmm10,%xmm2
|
||||
pxor %xmm6,%xmm2
|
||||
movdqu %xmm2,0x20(%rsi)
|
||||
movdqa %xmm2,%xmm0
|
||||
cmp $0x30,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x20(%rdx),%xmm0
|
||||
pxor %xmm2,%xmm0
|
||||
movdqu %xmm0,0x20(%rsi)
|
||||
# o3 = i3 ^ (x3 + s3)
|
||||
movdqu 0x30(%rdx),%xmm7
|
||||
paddd %xmm11,%xmm3
|
||||
pxor %xmm7,%xmm3
|
||||
movdqu %xmm3,0x30(%rsi)
|
||||
movdqa %xmm3,%xmm0
|
||||
cmp $0x40,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x30(%rdx),%xmm0
|
||||
pxor %xmm3,%xmm0
|
||||
movdqu %xmm0,0x30(%rsi)
|
||||
|
||||
.Ldone:
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(chacha20_block_xor_ssse3)
|
||||
|
||||
ENTRY(chacha20_4block_xor_ssse3)
|
||||
.Lxorpart:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rax,%r9
|
||||
and $0x0f,%r9
|
||||
jz .Ldone
|
||||
and $~0x0f,%rax
|
||||
|
||||
mov %rsi,%r11
|
||||
|
||||
lea 8(%rsp),%r10
|
||||
sub $0x10,%rsp
|
||||
and $~31,%rsp
|
||||
|
||||
lea (%rdx,%rax),%rsi
|
||||
mov %rsp,%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
pxor 0x00(%rsp),%xmm0
|
||||
movdqa %xmm0,0x00(%rsp)
|
||||
|
||||
mov %rsp,%rsi
|
||||
lea (%r11,%rax),%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
lea -8(%r10),%rsp
|
||||
jmp .Ldone
|
||||
|
||||
ENDPROC(chacha_block_xor_ssse3)
|
||||
|
||||
ENTRY(hchacha_block_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 4 data blocks output, o
|
||||
# %rdx: 4 data blocks input, i
|
||||
# %rsi: output (8 32-bit words)
|
||||
# %edx: nrounds
|
||||
FRAME_BEGIN
|
||||
|
||||
# This function encrypts four consecutive ChaCha20 blocks by loading the
|
||||
movdqu 0x00(%rdi),%xmm0
|
||||
movdqu 0x10(%rdi),%xmm1
|
||||
movdqu 0x20(%rdi),%xmm2
|
||||
movdqu 0x30(%rdi),%xmm3
|
||||
|
||||
mov %edx,%r8d
|
||||
call chacha_permute
|
||||
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
movdqu %xmm3,0x10(%rsi)
|
||||
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(hchacha_block_ssse3)
|
||||
|
||||
ENTRY(chacha_4block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 4 data blocks output, o
|
||||
# %rdx: up to 4 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts four consecutive ChaCha blocks by loading the
|
||||
# state matrix in SSE registers four times. As we need some scratch
|
||||
# registers, we save the first four registers on the stack. The
|
||||
# algorithm performs each operation on the corresponding word of each
|
||||
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
lea 8(%rsp),%r10
|
||||
sub $0x80,%rsp
|
||||
and $~63,%rsp
|
||||
mov %rcx,%rax
|
||||
|
||||
# x0..15[0-3] = s0..3[0..3]
|
||||
movq 0x00(%rdi),%xmm1
|
||||
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
# x12 += counter values 0-3
|
||||
paddd %xmm1,%xmm12
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround4:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
movdqa 0x00(%rsp),%xmm0
|
||||
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
psrld $25,%xmm4
|
||||
por %xmm0,%xmm4
|
||||
|
||||
dec %ecx
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround4
|
||||
|
||||
# x0[0-3] += s0[0]
|
||||
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
|
||||
# xor with corresponding input, write to output
|
||||
movdqa 0x00(%rsp),%xmm0
|
||||
cmp $0x10,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x00(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
movdqa 0x10(%rsp),%xmm0
|
||||
movdqu 0x80(%rdx),%xmm1
|
||||
|
||||
movdqu %xmm4,%xmm0
|
||||
cmp $0x20,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x10(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x80(%rsi)
|
||||
movdqu %xmm0,0x10(%rsi)
|
||||
|
||||
movdqu %xmm8,%xmm0
|
||||
cmp $0x30,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x20(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x20(%rsi)
|
||||
|
||||
movdqu %xmm12,%xmm0
|
||||
cmp $0x40,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x30(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x30(%rsi)
|
||||
|
||||
movdqa 0x20(%rsp),%xmm0
|
||||
cmp $0x50,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x40(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x40(%rsi)
|
||||
|
||||
movdqu %xmm6,%xmm0
|
||||
cmp $0x60,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x50(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x50(%rsi)
|
||||
|
||||
movdqu %xmm10,%xmm0
|
||||
cmp $0x70,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x60(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x60(%rsi)
|
||||
|
||||
movdqu %xmm14,%xmm0
|
||||
cmp $0x80,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x70(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x70(%rsi)
|
||||
|
||||
movdqa 0x10(%rsp),%xmm0
|
||||
cmp $0x90,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x80(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x80(%rsi)
|
||||
|
||||
movdqu %xmm5,%xmm0
|
||||
cmp $0xa0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x90(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x90(%rsi)
|
||||
|
||||
movdqu %xmm9,%xmm0
|
||||
cmp $0xb0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xa0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xa0(%rsi)
|
||||
|
||||
movdqu %xmm13,%xmm0
|
||||
cmp $0xc0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xb0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xb0(%rsi)
|
||||
|
||||
movdqa 0x30(%rsp),%xmm0
|
||||
cmp $0xd0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xc0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xc0(%rsi)
|
||||
movdqu 0x10(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm4
|
||||
movdqu %xmm4,0x10(%rsi)
|
||||
movdqu 0x90(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm5
|
||||
movdqu %xmm5,0x90(%rsi)
|
||||
movdqu 0x50(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm6
|
||||
movdqu %xmm6,0x50(%rsi)
|
||||
movdqu 0xd0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm7
|
||||
movdqu %xmm7,0xd0(%rsi)
|
||||
movdqu 0x20(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm8
|
||||
movdqu %xmm8,0x20(%rsi)
|
||||
movdqu 0xa0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm9
|
||||
movdqu %xmm9,0xa0(%rsi)
|
||||
movdqu 0x60(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm10
|
||||
movdqu %xmm10,0x60(%rsi)
|
||||
movdqu 0xe0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm11
|
||||
movdqu %xmm11,0xe0(%rsi)
|
||||
movdqu 0x30(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm12
|
||||
movdqu %xmm12,0x30(%rsi)
|
||||
movdqu 0xb0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm13
|
||||
movdqu %xmm13,0xb0(%rsi)
|
||||
movdqu 0x70(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm14
|
||||
movdqu %xmm14,0x70(%rsi)
|
||||
movdqu 0xf0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm15
|
||||
movdqu %xmm15,0xf0(%rsi)
|
||||
|
||||
movdqu %xmm7,%xmm0
|
||||
cmp $0xe0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xd0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xd0(%rsi)
|
||||
|
||||
movdqu %xmm11,%xmm0
|
||||
cmp $0xf0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xe0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xe0(%rsi)
|
||||
|
||||
movdqu %xmm15,%xmm0
|
||||
cmp $0x100,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xf0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xf0(%rsi)
|
||||
|
||||
.Ldone4:
|
||||
lea -8(%r10),%rsp
|
||||
ret
|
||||
ENDPROC(chacha20_4block_xor_ssse3)
|
||||
|
||||
.Lxorpart4:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rax,%r9
|
||||
and $0x0f,%r9
|
||||
jz .Ldone4
|
||||
and $~0x0f,%rax
|
||||
|
||||
mov %rsi,%r11
|
||||
|
||||
lea (%rdx,%rax),%rsi
|
||||
mov %rsp,%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
pxor 0x00(%rsp),%xmm0
|
||||
movdqa %xmm0,0x00(%rsp)
|
||||
|
||||
mov %rsp,%rsi
|
||||
lea (%r11,%rax),%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
jmp .Ldone4
|
||||
|
||||
ENDPROC(chacha_4block_xor_ssse3)
|
||||
@@ -1,448 +0,0 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
# ROT8: byte-shuffle mask (same pattern in both 128-bit halves). It is loaded
# into %ymm2 by chacha20_8block_xor_avx2 and used with vpshufb for the
# rotl32(..., 8) steps.
.section .rodata.cst32.ROT8, "aM", @progbits, 32
|
||||
.align 32
|
||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
|
||||
# ROT16: byte-shuffle mask loaded into %ymm3 and used with vpshufb for the
# rotl32(..., 16) steps.
.section .rodata.cst32.ROT16, "aM", @progbits, 32
|
||||
.align 32
|
||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
|
||||
# CTRINC: the eight 32-bit values 0..7, one per lane; loaded into %ymm1 and
# added to x12 (%ymm12) so that each lane gets its own block counter offset.
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTRINC: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha20_8block_xor_avx2)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 8 data blocks output, o
|
||||
# %rdx: 8 data blocks input, i
|
||||
|
||||
# This function encrypts eight consecutive ChaCha20 blocks by loading
|
||||
# the state matrix in AVX registers eight times. As we need some
|
||||
# scratch registers, we save the first four registers on the stack. The
|
||||
# algorithm performs each operation on the corresponding word of each
|
||||
# state matrix, hence requires no word shuffling. For final XORing step
|
||||
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
|
||||
# words, which allows us to do XOR in AVX registers. 8/16-bit word
|
||||
# rotation is done with the slightly better performing byte shuffling,
|
||||
# 7/12-bit word rotation uses traditional shift+OR.
|
||||
|
||||
vzeroupper
|
||||
# 4 * 32 byte stack, 32-byte aligned
|
||||
# %r10 keeps the incoming %rsp (+8); it is used to restore %rsp before ret
lea 8(%rsp),%r10
|
||||
and $~31, %rsp
|
||||
sub $0x80, %rsp
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
# x0..3 on stack
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm3,0x60(%rsp)
|
||||
|
||||
# %ymm1 = per-lane counter increments, %ymm2/%ymm3 = vpshufb rotate masks
vmovdqa CTRINC(%rip),%ymm1
|
||||
vmovdqa ROT8(%rip),%ymm2
|
||||
vmovdqa ROT16(%rip),%ymm3
|
||||
|
||||
# x12 += counter values 0-7
|
||||
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
# 10 iterations of the double round below
mov $10,%ecx
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
dec %ecx
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-7] += s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpaddd 0x00(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpbroadcastd 0x04(%rdi),%ymm0
|
||||
vpaddd 0x20(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpbroadcastd 0x08(%rdi),%ymm0
|
||||
vpaddd 0x40(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpbroadcastd 0x0c(%rdi),%ymm0
|
||||
vpaddd 0x60(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpbroadcastd 0x10(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm4,%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm5,%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm6,%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm7,%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm8,%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm9,%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm10,%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm11,%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm12,%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm13,%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm14,%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm15,%ymm15
|
||||
|
||||
# x12 += counter values 0-7
|
||||
# (%ymm1 still holds CTRINC; it is not written inside the round loop)
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
# interleave 32-bit words in state n, n+1
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x20(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpckldq %ymm5,%ymm0,%ymm4
|
||||
vpunpckhdq %ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm6,%ymm0
|
||||
vpunpckldq %ymm7,%ymm0,%ymm6
|
||||
vpunpckhdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpckldq %ymm9,%ymm0,%ymm8
|
||||
vpunpckhdq %ymm9,%ymm0,%ymm9
|
||||
vmovdqa %ymm10,%ymm0
|
||||
vpunpckldq %ymm11,%ymm0,%ymm10
|
||||
vpunpckhdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpckldq %ymm13,%ymm0,%ymm12
|
||||
vpunpckhdq %ymm13,%ymm0,%ymm13
|
||||
vmovdqa %ymm14,%ymm0
|
||||
vpunpckldq %ymm15,%ymm0,%ymm14
|
||||
vpunpckhdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 64-bit words in state n, n+2
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x40(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpcklqdq %ymm6,%ymm0,%ymm4
|
||||
vpunpckhqdq %ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm5,%ymm0
|
||||
vpunpcklqdq %ymm7,%ymm0,%ymm5
|
||||
vpunpckhqdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpcklqdq %ymm10,%ymm0,%ymm8
|
||||
vpunpckhqdq %ymm10,%ymm0,%ymm10
|
||||
vmovdqa %ymm9,%ymm0
|
||||
vpunpcklqdq %ymm11,%ymm0,%ymm9
|
||||
vpunpckhqdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpcklqdq %ymm14,%ymm0,%ymm12
|
||||
vpunpckhqdq %ymm14,%ymm0,%ymm14
|
||||
vmovdqa %ymm13,%ymm0
|
||||
vpunpcklqdq %ymm15,%ymm0,%ymm13
|
||||
vpunpckhqdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 128-bit words in state n, n+4
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm1,0x40(%rsp)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||
vmovdqa %ymm0,%ymm9
|
||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||
vmovdqa %ymm0,%ymm10
|
||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||
vmovdqa %ymm0,%ymm11
|
||||
|
||||
# xor with corresponding input, write to output
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vpxor 0x0000(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0000(%rsi)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vpxor 0x0080(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0080(%rsi)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vpxor 0x0040(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0040(%rsi)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vpxor 0x00c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x00c0(%rsi)
|
||||
vpxor 0x0100(%rdx),%ymm4,%ymm4
|
||||
vmovdqu %ymm4,0x0100(%rsi)
|
||||
vpxor 0x0180(%rdx),%ymm5,%ymm5
|
||||
vmovdqu %ymm5,0x00180(%rsi)
|
||||
vpxor 0x0140(%rdx),%ymm6,%ymm6
|
||||
vmovdqu %ymm6,0x0140(%rsi)
|
||||
vpxor 0x01c0(%rdx),%ymm7,%ymm7
|
||||
vmovdqu %ymm7,0x01c0(%rsi)
|
||||
vpxor 0x0020(%rdx),%ymm8,%ymm8
|
||||
vmovdqu %ymm8,0x0020(%rsi)
|
||||
vpxor 0x00a0(%rdx),%ymm9,%ymm9
|
||||
vmovdqu %ymm9,0x00a0(%rsi)
|
||||
vpxor 0x0060(%rdx),%ymm10,%ymm10
|
||||
vmovdqu %ymm10,0x0060(%rsi)
|
||||
vpxor 0x00e0(%rdx),%ymm11,%ymm11
|
||||
vmovdqu %ymm11,0x00e0(%rsi)
|
||||
vpxor 0x0120(%rdx),%ymm12,%ymm12
|
||||
vmovdqu %ymm12,0x0120(%rsi)
|
||||
vpxor 0x01a0(%rdx),%ymm13,%ymm13
|
||||
vmovdqu %ymm13,0x01a0(%rsi)
|
||||
vpxor 0x0160(%rdx),%ymm14,%ymm14
|
||||
vmovdqu %ymm14,0x0160(%rsi)
|
||||
vpxor 0x01e0(%rdx),%ymm15,%ymm15
|
||||
vmovdqu %ymm15,0x01e0(%rsi)
|
||||
|
||||
vzeroupper
|
||||
# restore the stack pointer saved in %r10 at entry
lea -8(%r10),%rsp
|
||||
ret
|
||||
ENDPROC(chacha20_8block_xor_avx2)
|
||||
@@ -1,146 +0,0 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
/* Byte alignment that chacha20_simd() applies to its on-stack state array. */
#define CHACHA20_STATE_ALIGN 16
|
||||
|
||||
/*
 * Assembly routines: each takes the 16-word state plus destination and source
 * buffers. NOTE(review): going by their names they handle 1, 4 and 8 blocks
 * per call; only the 8-block AVX2 routine is visible next to this file.
 */
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
||||
/* Set once in chacha20_simd_mod_init(); selects the 8-block AVX2 path. */
static bool chacha20_use_avx2;
|
||||
#endif
|
||||
|
||||
/*
 * chacha20_dosimd - XOR @bytes bytes of @src into @dst using the SIMD routines
 * @state: ChaCha20 state; word 12 (the block counter) is advanced once for
 *         every full block that is processed
 * @dst:   output buffer
 * @src:   input buffer
 * @bytes: number of bytes to process
 *
 * Work is handed out in decreasing batch sizes: 8 blocks (AVX2, when built in
 * and enabled), 4 blocks, then single blocks.  A trailing partial block is
 * bounced through a block-sized stack buffer; the counter is not advanced for
 * it.  The only caller in this file invokes this between kernel_fpu_begin()
 * and kernel_fpu_end().
 */
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	u8 tail[CHACHA_BLOCK_SIZE];

#ifdef CONFIG_AS_AVX2
	if (chacha20_use_avx2) {
		for (; bytes >= CHACHA_BLOCK_SIZE * 8;
		     bytes -= CHACHA_BLOCK_SIZE * 8) {
			chacha20_8block_xor_avx2(state, dst, src);
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}
	}
#endif
	for (; bytes >= CHACHA_BLOCK_SIZE * 4;
	     bytes -= CHACHA_BLOCK_SIZE * 4) {
		chacha20_4block_xor_ssse3(state, dst, src);
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}

	for (; bytes >= CHACHA_BLOCK_SIZE; bytes -= CHACHA_BLOCK_SIZE) {
		chacha20_block_xor_ssse3(state, dst, src);
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}

	if (!bytes)
		return;

	/* Partial final block: the asm routine always works on a whole block. */
	memcpy(tail, src, bytes);
	chacha20_block_xor_ssse3(state, tail, tail);
	memcpy(dst, tail, bytes);
}
|
||||
|
||||
static int chacha20_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
||||
|
||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_chacha_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
crypto_chacha_init(state, ctx, walk.iv);
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes % CHACHA_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (walk.nbytes) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_simd,
|
||||
.decrypt = chacha20_simd,
|
||||
};
|
||||
|
||||
/*
 * Module init: refuse to load without SSSE3, record whether the AVX2 path may
 * be used, then register the skcipher.
 *
 * Return: -ENODEV without SSSE3, otherwise the result of
 * crypto_register_skcipher().
 */
static int __init chacha20_simd_mod_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return -ENODEV;

#ifdef CONFIG_AS_AVX2
	/* AVX2 needs both CPU feature bits and the SSE+YMM xfeatures. */
	chacha20_use_avx2 = false;
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2))
		chacha20_use_avx2 = cpu_has_xfeatures(XFEATURE_MASK_SSE |
						      XFEATURE_MASK_YMM, NULL);
#endif
	return crypto_register_skcipher(&alg);
}
|
||||
|
||||
/* Module exit: unregister the skcipher registered by chacha20_simd_mod_init(). */
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
/* Hook the init/exit functions into module load and unload. */
module_init(chacha20_simd_mod_init);
|
||||
module_exit(chacha20_simd_mod_fini);
|
||||
|
||||
/* Module metadata; the two crypto aliases name the algorithm and this driver. */
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
||||
322
arch/x86/crypto/chacha_glue.c
Normal file
322
arch/x86/crypto/chacha_glue.c
Normal file
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
|
||||
* including ChaCha20 (RFC7539)
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
/*
 * Assembly entry points. The XOR routines take the state, destination, source,
 * a byte length and the number of rounds; hchacha_block_ssse3() takes the
 * state, an output word buffer and the number of rounds.
 */
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
|
||||
|
||||
/* NOTE(review): presumably provided by the AVX2 assembly file (not shown here). */
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
|
||||
/* NOTE(review): presumably provided by the AVX-512VL assembly file (not shown here). */
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
|
||||
/*
 * Static keys, all initially false, that gate the SIMD code paths below.
 * Where they get enabled is outside this excerpt.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
|
||||
|
||||
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
|
||||
{
|
||||
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
|
||||
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/*
 * chacha_dosimd - XOR @bytes bytes of @src into @dst with the widest SIMD path
 * @state:   ChaCha state; word 12 (the block counter) is advanced by the
 *           number of blocks consumed
 * @dst:     output buffer
 * @src:     input buffer
 * @bytes:   number of bytes to process
 * @nrounds: number of rounds, passed through to the asm routines
 *
 * Paths are tried in the order AVX-512VL, AVX2, SSSE3.  Within a path, full
 * 8-block (SSSE3: 4-block) batches are looped over first, then the remainder
 * goes to the smallest routine that covers it; the remaining byte count is
 * passed along to that routine.  In the AVX2 path a remainder of at most one
 * block drops through to the SSSE3 code at the bottom.
 *
 * Both callers in this file invoke this between kernel_fpu_begin() and
 * kernel_fpu_end().
 */
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	if (IS_ENABLED(CONFIG_AS_AVX512) &&
	    static_branch_likely(&chacha_use_avx512vl)) {
		for (; bytes >= CHACHA_BLOCK_SIZE * 8;
		     bytes -= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}

		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		} else if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		} else if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	if (IS_ENABLED(CONFIG_AS_AVX2) &&
	    static_branch_likely(&chacha_use_avx2)) {
		for (; bytes >= CHACHA_BLOCK_SIZE * 8;
		     bytes -= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}

		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		} else if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		} else if (bytes > CHACHA_BLOCK_SIZE) {
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
		/* at most one block left: handled by the SSSE3 code below */
	}

	for (; bytes >= CHACHA_BLOCK_SIZE * 4;
	     bytes -= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}

	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state[12] += chacha_advance(bytes, 4);
		return;
	}

	if (!bytes)
		return;

	chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
	state[12]++;
}
|
||||
|
||||
/*
 * hchacha_block_arch - HChaCha with the SSSE3 routine when SIMD is usable
 * @state:   input state
 * @stream:  output words
 * @nrounds: number of rounds
 *
 * Uses hchacha_block_ssse3() inside a kernel_fpu_begin()/kernel_fpu_end()
 * section when the chacha_use_simd key is enabled and may_use_simd() allows
 * it; otherwise calls hchacha_block_generic().
 */
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (static_branch_likely(&chacha_use_simd) && may_use_simd()) {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, stream, nrounds);
		kernel_fpu_end();
		return;
	}

	hchacha_block_generic(state, stream, nrounds);
}
EXPORT_SYMBOL(hchacha_block_arch);
|
||||
|
||||
/*
 * chacha_init_arch - arch hook for state setup
 *
 * Passes @state, @key and @iv straight to chacha_init_generic(); nothing
 * x86-specific is done here.
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||
{
|
||||
chacha_init_generic(state, key, iv);
|
||||
}
|
||||
EXPORT_SYMBOL(chacha_init_arch);
|
||||
|
||||
/*
 * chacha_crypt_arch - XOR @bytes bytes of @src into @dst, SIMD when possible
 * @state:   ChaCha state
 * @dst:     output buffer
 * @src:     input buffer
 * @bytes:   number of bytes to process
 * @nrounds: number of rounds
 *
 * Calls chacha_crypt_generic() when the chacha_use_simd key is off, SIMD may
 * not be used right now, or the input is at most one block.  Otherwise the
 * data is processed in pieces of at most SZ_4K bytes, each inside its own
 * kernel_fpu_begin()/kernel_fpu_end() section.
 * NOTE(review): the 4K cap presumably bounds how long the FPU stays claimed.
 */
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	unsigned int chunk;

	if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_crypt_generic(state, dst, src, bytes, nrounds);
		return;
	}

	/* bytes exceeds one block here, so the loop body runs at least once */
	while (bytes) {
		chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, chunk, nrounds);
		kernel_fpu_end();

		src += chunk;
		dst += chunk;
		bytes -= chunk;
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||
|
||||
static int chacha_simd_stream_xor(struct skcipher_request *req,
|
||||
const struct chacha_ctx *ctx, const u8 *iv)
|
||||
{
|
||||
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
chacha_init_generic(state, ctx->key, iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
if (!static_branch_likely(&chacha_use_simd) ||
|
||||
!may_use_simd()) {
|
||||
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes,
|
||||
ctx->nrounds);
|
||||
} else {
|
||||
kernel_fpu_begin();
|
||||
chacha_dosimd(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes,
|
||||
ctx->nrounds);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int chacha_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
return chacha_simd_stream_xor(req, ctx, req->iv);
|
||||
}
|
||||
|
||||
static int xchacha_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||
struct chacha_ctx subctx;
|
||||
u8 real_iv[16];
|
||||
|
||||
chacha_init_generic(state, ctx->key, req->iv);
|
||||
|
||||
if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
|
||||
kernel_fpu_begin();
|
||||
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
|
||||
kernel_fpu_end();
|
||||
} else {
|
||||
hchacha_block_generic(state, subctx.key, ctx->nrounds);
|
||||
}
|
||||
subctx.nrounds = ctx->nrounds;
|
||||
|
||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||
return chacha_simd_stream_xor(req, &subctx, real_iv);
|
||||
}
|
||||
|
||||
static struct skcipher_alg algs[] = {
|
||||
{
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = chacha_simd,
|
||||
.decrypt = chacha_simd,
|
||||
}, {
|
||||
.base.cra_name = "xchacha20",
|
||||
.base.cra_driver_name = "xchacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = xchacha_simd,
|
||||
.decrypt = xchacha_simd,
|
||||
}, {
|
||||
.base.cra_name = "xchacha12",
|
||||
.base.cra_driver_name = "xchacha12-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = xchacha_simd,
|
||||
.decrypt = xchacha_simd,
|
||||
},
|
||||
};
|
||||
|
||||
/*
 * Module init: pick the SIMD code paths this CPU can run and register the
 * skcipher algorithms.
 *
 * Without SSSE3 nothing is enabled or registered and 0 is returned.  With
 * SSSE3 the base SIMD static key is switched on.  The AVX2 key follows when
 * the assembler option, the AVX and AVX2 CPU features and the SSE|YMM
 * xfeatures are all present.  The AVX-512VL key is only considered once AVX2
 * has been enabled.
 *
 * Returns 0, or the result of crypto_register_skciphers() when
 * CONFIG_CRYPTO_BLKCIPHER is reachable.
 */
static int __init chacha_simd_mod_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&chacha_use_simd);

	if (IS_ENABLED(CONFIG_AS_AVX2) &&
	    boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
		static_branch_enable(&chacha_use_avx2);

		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
			static_branch_enable(&chacha_use_avx512vl);
	}

	/* Without the skcipher API there is nothing to register. */
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return 0;

	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||
|
||||
/*
 * Module exit: unregister the skcipher algorithms, using the same
 * CONFIG_CRYPTO_BLKCIPHER and SSSE3 checks that decide whether module init
 * registers them.
 */
static void __exit chacha_simd_mod_fini(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return;

	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||
|
||||
module_init(chacha_simd_mod_init);
|
||||
module_exit(chacha_simd_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20-simd");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12-simd");
|
||||
1512
arch/x86/crypto/curve25519-x86_64.c
Normal file
1512
arch/x86/crypto/curve25519-x86_64.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,394 +0,0 @@
|
||||
/*
|
||||
* Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst32.ANMASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
|
||||
.octa 0x0000000003ffffff0000000003ffffff
|
||||
|
||||
.section .rodata.cst32.ORMASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
ORMASK: .octa 0x00000000010000000000000001000000
|
||||
.octa 0x00000000010000000000000001000000
|
||||
|
||||
.text
|
||||
|
||||
#define h0 0x00(%rdi)
|
||||
#define h1 0x04(%rdi)
|
||||
#define h2 0x08(%rdi)
|
||||
#define h3 0x0c(%rdi)
|
||||
#define h4 0x10(%rdi)
|
||||
#define r0 0x00(%rdx)
|
||||
#define r1 0x04(%rdx)
|
||||
#define r2 0x08(%rdx)
|
||||
#define r3 0x0c(%rdx)
|
||||
#define r4 0x10(%rdx)
|
||||
#define u0 0x00(%r8)
|
||||
#define u1 0x04(%r8)
|
||||
#define u2 0x08(%r8)
|
||||
#define u3 0x0c(%r8)
|
||||
#define u4 0x10(%r8)
|
||||
#define w0 0x14(%r8)
|
||||
#define w1 0x18(%r8)
|
||||
#define w2 0x1c(%r8)
|
||||
#define w3 0x20(%r8)
|
||||
#define w4 0x24(%r8)
|
||||
#define y0 0x28(%r8)
|
||||
#define y1 0x2c(%r8)
|
||||
#define y2 0x30(%r8)
|
||||
#define y3 0x34(%r8)
|
||||
#define y4 0x38(%r8)
|
||||
#define m %rsi
|
||||
#define hc0 %ymm0
|
||||
#define hc1 %ymm1
|
||||
#define hc2 %ymm2
|
||||
#define hc3 %ymm3
|
||||
#define hc4 %ymm4
|
||||
#define hc0x %xmm0
|
||||
#define hc1x %xmm1
|
||||
#define hc2x %xmm2
|
||||
#define hc3x %xmm3
|
||||
#define hc4x %xmm4
|
||||
#define t1 %ymm5
|
||||
#define t2 %ymm6
|
||||
#define t1x %xmm5
|
||||
#define t2x %xmm6
|
||||
#define ruwy0 %ymm7
|
||||
#define ruwy1 %ymm8
|
||||
#define ruwy2 %ymm9
|
||||
#define ruwy3 %ymm10
|
||||
#define ruwy4 %ymm11
|
||||
#define ruwy0x %xmm7
|
||||
#define ruwy1x %xmm8
|
||||
#define ruwy2x %xmm9
|
||||
#define ruwy3x %xmm10
|
||||
#define ruwy4x %xmm11
|
||||
#define svxz1 %ymm12
|
||||
#define svxz2 %ymm13
|
||||
#define svxz3 %ymm14
|
||||
#define svxz4 %ymm15
|
||||
#define d0 %r9
|
||||
#define d1 %r10
|
||||
#define d2 %r11
|
||||
#define d3 %r12
|
||||
#define d4 %r13
|
||||
|
||||
ENTRY(poly1305_4block_avx2)
|
||||
# %rdi: Accumulator h[5]
|
||||
# %rsi: 64 byte input block m
|
||||
# %rdx: Poly1305 key r[5]
|
||||
# %rcx: Quadblock count
|
||||
# %r8: Poly1305 derived keys r^2 u[5], r^3 w[5], r^4 y[5]
|
||||
|
||||
# This four-block variant uses loop unrolled block processing. It
|
||||
# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
|
||||
# h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
|
||||
|
||||
vzeroupper
|
||||
push %rbx
|
||||
push %r12
|
||||
push %r13
|
||||
|
||||
# combine r0,u0,w0,y0
|
||||
vmovd y0,ruwy0x
|
||||
vmovd w0,t1x
|
||||
vpunpcklqdq t1,ruwy0,ruwy0
|
||||
vmovd u0,t1x
|
||||
vmovd r0,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy0,ruwy0
|
||||
|
||||
# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
|
||||
vmovd y1,ruwy1x
|
||||
vmovd w1,t1x
|
||||
vpunpcklqdq t1,ruwy1,ruwy1
|
||||
vmovd u1,t1x
|
||||
vmovd r1,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy1,ruwy1
|
||||
vpslld $2,ruwy1,svxz1
|
||||
vpaddd ruwy1,svxz1,svxz1
|
||||
|
||||
# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
|
||||
vmovd y2,ruwy2x
|
||||
vmovd w2,t1x
|
||||
vpunpcklqdq t1,ruwy2,ruwy2
|
||||
vmovd u2,t1x
|
||||
vmovd r2,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy2,ruwy2
|
||||
vpslld $2,ruwy2,svxz2
|
||||
vpaddd ruwy2,svxz2,svxz2
|
||||
|
||||
# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
|
||||
vmovd y3,ruwy3x
|
||||
vmovd w3,t1x
|
||||
vpunpcklqdq t1,ruwy3,ruwy3
|
||||
vmovd u3,t1x
|
||||
vmovd r3,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy3,ruwy3
|
||||
vpslld $2,ruwy3,svxz3
|
||||
vpaddd ruwy3,svxz3,svxz3
|
||||
|
||||
# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
|
||||
vmovd y4,ruwy4x
|
||||
vmovd w4,t1x
|
||||
vpunpcklqdq t1,ruwy4,ruwy4
|
||||
vmovd u4,t1x
|
||||
vmovd r4,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy4,ruwy4
|
||||
vpslld $2,ruwy4,svxz4
|
||||
vpaddd ruwy4,svxz4,svxz4
|
||||
|
||||
.Ldoblock4:
|
||||
# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
|
||||
# m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
|
||||
vmovd 0x00(m),hc0x
|
||||
vmovd 0x10(m),t1x
|
||||
vpunpcklqdq t1,hc0,hc0
|
||||
vmovd 0x20(m),t1x
|
||||
vmovd 0x30(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc0,hc0
|
||||
vpand ANMASK(%rip),hc0,hc0
|
||||
vmovd h0,t1x
|
||||
vpaddd t1,hc0,hc0
|
||||
# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
|
||||
# (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
|
||||
vmovd 0x03(m),hc1x
|
||||
vmovd 0x13(m),t1x
|
||||
vpunpcklqdq t1,hc1,hc1
|
||||
vmovd 0x23(m),t1x
|
||||
vmovd 0x33(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc1,hc1
|
||||
vpsrld $2,hc1,hc1
|
||||
vpand ANMASK(%rip),hc1,hc1
|
||||
vmovd h1,t1x
|
||||
vpaddd t1,hc1,hc1
|
||||
# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
|
||||
# (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
|
||||
vmovd 0x06(m),hc2x
|
||||
vmovd 0x16(m),t1x
|
||||
vpunpcklqdq t1,hc2,hc2
|
||||
vmovd 0x26(m),t1x
|
||||
vmovd 0x36(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc2,hc2
|
||||
vpsrld $4,hc2,hc2
|
||||
vpand ANMASK(%rip),hc2,hc2
|
||||
vmovd h2,t1x
|
||||
vpaddd t1,hc2,hc2
|
||||
# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
|
||||
# (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
|
||||
vmovd 0x09(m),hc3x
|
||||
vmovd 0x19(m),t1x
|
||||
vpunpcklqdq t1,hc3,hc3
|
||||
vmovd 0x29(m),t1x
|
||||
vmovd 0x39(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc3,hc3
|
||||
vpsrld $6,hc3,hc3
|
||||
vpand ANMASK(%rip),hc3,hc3
|
||||
vmovd h3,t1x
|
||||
vpaddd t1,hc3,hc3
|
||||
# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
|
||||
# (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
|
||||
vmovd 0x0c(m),hc4x
|
||||
vmovd 0x1c(m),t1x
|
||||
vpunpcklqdq t1,hc4,hc4
|
||||
vmovd 0x2c(m),t1x
|
||||
vmovd 0x3c(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc4,hc4
|
||||
vpsrld $8,hc4,hc4
|
||||
vpor ORMASK(%rip),hc4,hc4
|
||||
vmovd h4,t1x
|
||||
vpaddd t1,hc4,hc4
|
||||
|
||||
# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
|
||||
vpmuludq hc0,ruwy0,t1
|
||||
# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
|
||||
vpmuludq hc1,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
|
||||
vpmuludq hc2,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
|
||||
vpmuludq hc3,svxz2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
|
||||
vpmuludq hc4,svxz1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d0 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d0
|
||||
|
||||
# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
|
||||
vpmuludq hc0,ruwy1,t1
|
||||
# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
|
||||
vpmuludq hc1,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
|
||||
vpmuludq hc2,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
|
||||
vpmuludq hc3,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
|
||||
vpmuludq hc4,svxz2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d1 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d1
|
||||
|
||||
# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
|
||||
vpmuludq hc0,ruwy2,t1
|
||||
# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
|
||||
vpmuludq hc1,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
|
||||
vpmuludq hc2,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
|
||||
vpmuludq hc3,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
|
||||
vpmuludq hc4,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d2 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d2
|
||||
|
||||
# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
|
||||
vpmuludq hc0,ruwy3,t1
|
||||
# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
|
||||
vpmuludq hc1,ruwy2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
|
||||
vpmuludq hc2,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
|
||||
vpmuludq hc3,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
|
||||
vpmuludq hc4,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d3 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d3
|
||||
|
||||
# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
|
||||
vpmuludq hc0,ruwy4,t1
|
||||
# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
|
||||
vpmuludq hc1,ruwy3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
|
||||
vpmuludq hc2,ruwy2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
|
||||
vpmuludq hc3,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
|
||||
vpmuludq hc4,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d4 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d4
|
||||
|
||||
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
|
||||
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
|
||||
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
|
||||
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
|
||||
# integers. It's true in a single-block implementation, but not here.
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d1
|
||||
# h0 = d0 & 0x3ffffff
|
||||
mov d0,%rbx
|
||||
and $0x3ffffff,%ebx
|
||||
|
||||
# d2 += d1 >> 26
|
||||
mov d1,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d2
|
||||
# h1 = d1 & 0x3ffffff
|
||||
mov d1,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h1
|
||||
|
||||
# d3 += d2 >> 26
|
||||
mov d2,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d3
|
||||
# h2 = d2 & 0x3ffffff
|
||||
mov d2,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h2
|
||||
|
||||
# d4 += d3 >> 26
|
||||
mov d3,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d4
|
||||
# h3 = d3 & 0x3ffffff
|
||||
mov d3,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h3
|
||||
|
||||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
mov %ebx,h0
|
||||
|
||||
add $0x40,m
|
||||
dec %rcx
|
||||
jnz .Ldoblock4
|
||||
|
||||
vzeroupper
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
ret
|
||||
ENDPROC(poly1305_4block_avx2)
|
||||
@@ -1,590 +0,0 @@
|
||||
/*
|
||||
* Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst16.ANMASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
|
||||
|
||||
.section .rodata.cst16.ORMASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
ORMASK: .octa 0x00000000010000000000000001000000
|
||||
|
||||
.text
|
||||
|
||||
#define h0 0x00(%rdi)
|
||||
#define h1 0x04(%rdi)
|
||||
#define h2 0x08(%rdi)
|
||||
#define h3 0x0c(%rdi)
|
||||
#define h4 0x10(%rdi)
|
||||
#define r0 0x00(%rdx)
|
||||
#define r1 0x04(%rdx)
|
||||
#define r2 0x08(%rdx)
|
||||
#define r3 0x0c(%rdx)
|
||||
#define r4 0x10(%rdx)
|
||||
#define s1 0x00(%rsp)
|
||||
#define s2 0x04(%rsp)
|
||||
#define s3 0x08(%rsp)
|
||||
#define s4 0x0c(%rsp)
|
||||
#define m %rsi
|
||||
#define h01 %xmm0
|
||||
#define h23 %xmm1
|
||||
#define h44 %xmm2
|
||||
#define t1 %xmm3
|
||||
#define t2 %xmm4
|
||||
#define t3 %xmm5
|
||||
#define t4 %xmm6
|
||||
#define mask %xmm7
|
||||
#define d0 %r8
|
||||
#define d1 %r9
|
||||
#define d2 %r10
|
||||
#define d3 %r11
|
||||
#define d4 %r12
|
||||
|
||||
ENTRY(poly1305_block_sse2)
|
||||
# %rdi: Accumulator h[5]
|
||||
# %rsi: 16 byte input block m
|
||||
# %rdx: Poly1305 key r[5]
|
||||
# %rcx: Block count
|
||||
|
||||
# This single block variant tries to improve performance by doing two
|
||||
# multiplications in parallel using SSE instructions. There is quite
|
||||
# some quadword packing involved, hence the speedup is marginal.
|
||||
|
||||
push %rbx
|
||||
push %r12
|
||||
sub $0x10,%rsp
|
||||
|
||||
# s1..s4 = r1..r4 * 5
|
||||
mov r1,%eax
|
||||
lea (%eax,%eax,4),%eax
|
||||
mov %eax,s1
|
||||
mov r2,%eax
|
||||
lea (%eax,%eax,4),%eax
|
||||
mov %eax,s2
|
||||
mov r3,%eax
|
||||
lea (%eax,%eax,4),%eax
|
||||
mov %eax,s3
|
||||
mov r4,%eax
|
||||
lea (%eax,%eax,4),%eax
|
||||
mov %eax,s4
|
||||
|
||||
movdqa ANMASK(%rip),mask
|
||||
|
||||
.Ldoblock:
|
||||
# h01 = [0, h1, 0, h0]
|
||||
# h23 = [0, h3, 0, h2]
|
||||
# h44 = [0, h4, 0, h4]
|
||||
movd h0,h01
|
||||
movd h1,t1
|
||||
movd h2,h23
|
||||
movd h3,t2
|
||||
movd h4,h44
|
||||
punpcklqdq t1,h01
|
||||
punpcklqdq t2,h23
|
||||
punpcklqdq h44,h44
|
||||
|
||||
# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
|
||||
movd 0x00(m),t1
|
||||
movd 0x03(m),t2
|
||||
psrld $2,t2
|
||||
punpcklqdq t2,t1
|
||||
pand mask,t1
|
||||
paddd t1,h01
|
||||
# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
|
||||
movd 0x06(m),t1
|
||||
movd 0x09(m),t2
|
||||
psrld $4,t1
|
||||
psrld $6,t2
|
||||
punpcklqdq t2,t1
|
||||
pand mask,t1
|
||||
paddd t1,h23
|
||||
# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
|
||||
mov 0x0c(m),%eax
|
||||
shr $8,%eax
|
||||
or $0x01000000,%eax
|
||||
movd %eax,t1
|
||||
pshufd $0xc4,t1,t1
|
||||
paddd t1,h44
|
||||
|
||||
# t1[0] = h0 * r0 + h2 * s3
|
||||
# t1[1] = h1 * s4 + h3 * s2
|
||||
movd r0,t1
|
||||
movd s4,t2
|
||||
punpcklqdq t2,t1
|
||||
pmuludq h01,t1
|
||||
movd s3,t2
|
||||
movd s2,t3
|
||||
punpcklqdq t3,t2
|
||||
pmuludq h23,t2
|
||||
paddq t2,t1
|
||||
# t2[0] = h0 * r1 + h2 * s4
|
||||
# t2[1] = h1 * r0 + h3 * s3
|
||||
movd r1,t2
|
||||
movd r0,t3
|
||||
punpcklqdq t3,t2
|
||||
pmuludq h01,t2
|
||||
movd s4,t3
|
||||
movd s3,t4
|
||||
punpcklqdq t4,t3
|
||||
pmuludq h23,t3
|
||||
paddq t3,t2
|
||||
# t3[0] = h4 * s1
|
||||
# t3[1] = h4 * s2
|
||||
movd s1,t3
|
||||
movd s2,t4
|
||||
punpcklqdq t4,t3
|
||||
pmuludq h44,t3
|
||||
# d0 = t1[0] + t1[1] + t3[0]
|
||||
# d1 = t2[0] + t2[1] + t3[1]
|
||||
movdqa t1,t4
|
||||
punpcklqdq t2,t4
|
||||
punpckhqdq t2,t1
|
||||
paddq t4,t1
|
||||
paddq t3,t1
|
||||
movq t1,d0
|
||||
psrldq $8,t1
|
||||
movq t1,d1
|
||||
|
||||
# t1[0] = h0 * r2 + h2 * r0
|
||||
# t1[1] = h1 * r1 + h3 * s4
|
||||
movd r2,t1
|
||||
movd r1,t2
|
||||
punpcklqdq t2,t1
|
||||
pmuludq h01,t1
|
||||
movd r0,t2
|
||||
movd s4,t3
|
||||
punpcklqdq t3,t2
|
||||
pmuludq h23,t2
|
||||
paddq t2,t1
|
||||
# t2[0] = h0 * r3 + h2 * r1
|
||||
# t2[1] = h1 * r2 + h3 * r0
|
||||
movd r3,t2
|
||||
movd r2,t3
|
||||
punpcklqdq t3,t2
|
||||
pmuludq h01,t2
|
||||
movd r1,t3
|
||||
movd r0,t4
|
||||
punpcklqdq t4,t3
|
||||
pmuludq h23,t3
|
||||
paddq t3,t2
|
||||
# t3[0] = h4 * s3
|
||||
# t3[1] = h4 * s4
|
||||
movd s3,t3
|
||||
movd s4,t4
|
||||
punpcklqdq t4,t3
|
||||
pmuludq h44,t3
|
||||
# d2 = t1[0] + t1[1] + t3[0]
|
||||
# d3 = t2[0] + t2[1] + t3[1]
|
||||
movdqa t1,t4
|
||||
punpcklqdq t2,t4
|
||||
punpckhqdq t2,t1
|
||||
paddq t4,t1
|
||||
paddq t3,t1
|
||||
movq t1,d2
|
||||
psrldq $8,t1
|
||||
movq t1,d3
|
||||
|
||||
# t1[0] = h0 * r4 + h2 * r2
|
||||
# t1[1] = h1 * r3 + h3 * r1
|
||||
movd r4,t1
|
||||
movd r3,t2
|
||||
punpcklqdq t2,t1
|
||||
pmuludq h01,t1
|
||||
movd r2,t2
|
||||
movd r1,t3
|
||||
punpcklqdq t3,t2
|
||||
pmuludq h23,t2
|
||||
paddq t2,t1
|
||||
# t3[0] = h4 * r0
|
||||
movd r0,t3
|
||||
pmuludq h44,t3
|
||||
# d4 = t1[0] + t1[1] + t3[0]
|
||||
movdqa t1,t4
|
||||
psrldq $8,t4
|
||||
paddq t4,t1
|
||||
paddq t3,t1
|
||||
movq t1,d4
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d1
|
||||
# h0 = d0 & 0x3ffffff
|
||||
mov d0,%rbx
|
||||
and $0x3ffffff,%ebx
|
||||
|
||||
# d2 += d1 >> 26
|
||||
mov d1,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d2
|
||||
# h1 = d1 & 0x3ffffff
|
||||
mov d1,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h1
|
||||
|
||||
# d3 += d2 >> 26
|
||||
mov d2,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d3
|
||||
# h2 = d2 & 0x3ffffff
|
||||
mov d2,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h2
|
||||
|
||||
# d4 += d3 >> 26
|
||||
mov d3,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d4
|
||||
# h3 = d3 & 0x3ffffff
|
||||
mov d3,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h3
|
||||
|
||||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
mov %ebx,h0
|
||||
|
||||
add $0x10,m
|
||||
dec %rcx
|
||||
jnz .Ldoblock
|
||||
|
||||
add $0x10,%rsp
|
||||
pop %r12
|
||||
pop %rbx
|
||||
ret
|
||||
ENDPROC(poly1305_block_sse2)
|
||||
|
||||
|
||||
#define u0 0x00(%r8)
|
||||
#define u1 0x04(%r8)
|
||||
#define u2 0x08(%r8)
|
||||
#define u3 0x0c(%r8)
|
||||
#define u4 0x10(%r8)
|
||||
#define hc0 %xmm0
|
||||
#define hc1 %xmm1
|
||||
#define hc2 %xmm2
|
||||
#define hc3 %xmm5
|
||||
#define hc4 %xmm6
|
||||
#define ru0 %xmm7
|
||||
#define ru1 %xmm8
|
||||
#define ru2 %xmm9
|
||||
#define ru3 %xmm10
|
||||
#define ru4 %xmm11
|
||||
#define sv1 %xmm12
|
||||
#define sv2 %xmm13
|
||||
#define sv3 %xmm14
|
||||
#define sv4 %xmm15
|
||||
#undef d0
|
||||
#define d0 %r13
|
||||
|
||||
ENTRY(poly1305_2block_sse2)
|
||||
# %rdi: Accumulator h[5]
|
||||
# %rsi: Input m, consumed as pairs of 16 byte blocks (32 bytes per iteration)
|
||||
# %rdx: Poly1305 key r[5]
|
||||
# %rcx: Doubleblock count
|
||||
# %r8: Poly1305 derived key r^2 u[5]
|
||||
|
||||
# This two-block variant further improves performance by using loop
|
||||
# unrolled block processing. This is more straightforward and does
|
||||
# less byte shuffling, but requires a second Poly1305 key r^2:
|
||||
# h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
|
||||
|
||||
push %rbx
|
||||
push %r12
|
||||
push %r13
|
||||
|
||||
# combine r0,u0
|
||||
movd u0,ru0
|
||||
movd r0,t1
|
||||
punpcklqdq t1,ru0
|
||||
|
||||
# combine r1,u1 and s1=r1*5,v1=u1*5
|
||||
movd u1,ru1
|
||||
movd r1,t1
|
||||
punpcklqdq t1,ru1
|
||||
movdqa ru1,sv1
|
||||
pslld $2,sv1
|
||||
paddd ru1,sv1
|
||||
|
||||
# combine r2,u2 and s2=r2*5,v2=u2*5
|
||||
movd u2,ru2
|
||||
movd r2,t1
|
||||
punpcklqdq t1,ru2
|
||||
movdqa ru2,sv2
|
||||
pslld $2,sv2
|
||||
paddd ru2,sv2
|
||||
|
||||
# combine r3,u3 and s3=r3*5,v3=u3*5
|
||||
movd u3,ru3
|
||||
movd r3,t1
|
||||
punpcklqdq t1,ru3
|
||||
movdqa ru3,sv3
|
||||
pslld $2,sv3
|
||||
paddd ru3,sv3
|
||||
|
||||
# combine r4,u4 and s4=r4*5,v4=u4*5
|
||||
movd u4,ru4
|
||||
movd r4,t1
|
||||
punpcklqdq t1,ru4
|
||||
movdqa ru4,sv4
|
||||
pslld $2,sv4
|
||||
paddd ru4,sv4
|
||||
|
||||
.Ldoblock2:
|
||||
# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
|
||||
movd 0x00(m),hc0
|
||||
movd 0x10(m),t1
|
||||
punpcklqdq t1,hc0
|
||||
pand ANMASK(%rip),hc0
|
||||
movd h0,t1
|
||||
paddd t1,hc0
|
||||
# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
|
||||
movd 0x03(m),hc1
|
||||
movd 0x13(m),t1
|
||||
punpcklqdq t1,hc1
|
||||
psrld $2,hc1
|
||||
pand ANMASK(%rip),hc1
|
||||
movd h1,t1
|
||||
paddd t1,hc1
|
||||
# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
|
||||
movd 0x06(m),hc2
|
||||
movd 0x16(m),t1
|
||||
punpcklqdq t1,hc2
|
||||
psrld $4,hc2
|
||||
pand ANMASK(%rip),hc2
|
||||
movd h2,t1
|
||||
paddd t1,hc2
|
||||
# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
|
||||
movd 0x09(m),hc3
|
||||
movd 0x19(m),t1
|
||||
punpcklqdq t1,hc3
|
||||
psrld $6,hc3
|
||||
pand ANMASK(%rip),hc3
|
||||
movd h3,t1
|
||||
paddd t1,hc3
|
||||
# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
|
||||
movd 0x0c(m),hc4
|
||||
movd 0x1c(m),t1
|
||||
punpcklqdq t1,hc4
|
||||
psrld $8,hc4
|
||||
por ORMASK(%rip),hc4
|
||||
movd h4,t1
|
||||
paddd t1,hc4
|
||||
|
||||
# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
|
||||
movdqa ru0,t1
|
||||
pmuludq hc0,t1
|
||||
# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
|
||||
movdqa sv4,t2
|
||||
pmuludq hc1,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
|
||||
movdqa sv3,t2
|
||||
pmuludq hc2,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
|
||||
movdqa sv2,t2
|
||||
pmuludq hc3,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
|
||||
movdqa sv1,t2
|
||||
pmuludq hc4,t2
|
||||
paddq t2,t1
|
||||
# d0 = t1[0] + t1[1]
|
||||
movdqa t1,t2
|
||||
psrldq $8,t2
|
||||
paddq t2,t1
|
||||
movq t1,d0
|
||||
|
||||
# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
|
||||
movdqa ru1,t1
|
||||
pmuludq hc0,t1
|
||||
# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
|
||||
movdqa ru0,t2
|
||||
pmuludq hc1,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
|
||||
movdqa sv4,t2
|
||||
pmuludq hc2,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
|
||||
movdqa sv3,t2
|
||||
pmuludq hc3,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
|
||||
movdqa sv2,t2
|
||||
pmuludq hc4,t2
|
||||
paddq t2,t1
|
||||
# d1 = t1[0] + t1[1]
|
||||
movdqa t1,t2
|
||||
psrldq $8,t2
|
||||
paddq t2,t1
|
||||
movq t1,d1
|
||||
|
||||
# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
|
||||
movdqa ru2,t1
|
||||
pmuludq hc0,t1
|
||||
# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
|
||||
movdqa ru1,t2
|
||||
pmuludq hc1,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
|
||||
movdqa ru0,t2
|
||||
pmuludq hc2,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
|
||||
movdqa sv4,t2
|
||||
pmuludq hc3,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
|
||||
movdqa sv3,t2
|
||||
pmuludq hc4,t2
|
||||
paddq t2,t1
|
||||
# d2 = t1[0] + t1[1]
|
||||
movdqa t1,t2
|
||||
psrldq $8,t2
|
||||
paddq t2,t1
|
||||
movq t1,d2
|
||||
|
||||
# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
|
||||
movdqa ru3,t1
|
||||
pmuludq hc0,t1
|
||||
# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
|
||||
movdqa ru2,t2
|
||||
pmuludq hc1,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
|
||||
movdqa ru1,t2
|
||||
pmuludq hc2,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
|
||||
movdqa ru0,t2
|
||||
pmuludq hc3,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
|
||||
movdqa sv4,t2
|
||||
pmuludq hc4,t2
|
||||
paddq t2,t1
|
||||
# d3 = t1[0] + t1[1]
|
||||
movdqa t1,t2
|
||||
psrldq $8,t2
|
||||
paddq t2,t1
|
||||
movq t1,d3
|
||||
|
||||
# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
|
||||
movdqa ru4,t1
|
||||
pmuludq hc0,t1
|
||||
# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
|
||||
movdqa ru3,t2
|
||||
pmuludq hc1,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
|
||||
movdqa ru2,t2
|
||||
pmuludq hc2,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
|
||||
movdqa ru1,t2
|
||||
pmuludq hc3,t2
|
||||
paddq t2,t1
|
||||
# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
|
||||
movdqa ru0,t2
|
||||
pmuludq hc4,t2
|
||||
paddq t2,t1
|
||||
# d4 = t1[0] + t1[1]
|
||||
movdqa t1,t2
|
||||
psrldq $8,t2
|
||||
paddq t2,t1
|
||||
movq t1,d4
|
||||
|
||||
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
|
||||
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
|
||||
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
|
||||
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
|
||||
# integers. It's true in a single-block implementation, but not here.
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d1
|
||||
# h0 = d0 & 0x3ffffff
|
||||
mov d0,%rbx
|
||||
and $0x3ffffff,%ebx
|
||||
|
||||
# d2 += d1 >> 26
|
||||
mov d1,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d2
|
||||
# h1 = d1 & 0x3ffffff
|
||||
mov d1,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h1
|
||||
|
||||
# d3 += d2 >> 26
|
||||
mov d2,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d3
|
||||
# h2 = d2 & 0x3ffffff
|
||||
mov d2,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h2
|
||||
|
||||
# d4 += d3 >> 26
|
||||
mov d3,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d4
|
||||
# h3 = d3 & 0x3ffffff
|
||||
mov d3,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h3
|
||||
|
||||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
mov %ebx,h0
|
||||
|
||||
add $0x20,m
|
||||
dec %rcx
|
||||
jnz .Ldoblock2
|
||||
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
ret
|
||||
ENDPROC(poly1305_2block_sse2)
|
||||
4265
arch/x86/crypto/poly1305-x86_64-cryptogams.pl
Normal file
4265
arch/x86/crypto/poly1305-x86_64-cryptogams.pl
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,135 +1,175 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||
/*
|
||||
* Poly1305 authenticator algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/poly1305.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
#include <asm/intel-family.h>
|
||||
|
||||
struct poly1305_simd_desc_ctx {
|
||||
struct poly1305_desc_ctx base;
|
||||
/* derived key u set? */
|
||||
bool uset;
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
/* derived keys r^3, r^4 set? */
|
||||
bool wset;
|
||||
#endif
|
||||
/* derived Poly1305 key r^2 */
|
||||
u32 u[5];
|
||||
/* ... silently appended r^3 and r^4 when using AVX2 */
|
||||
asmlinkage void poly1305_init_x86_64(void *ctx,
|
||||
const u8 key[POLY1305_KEY_SIZE]);
|
||||
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
|
||||
const size_t len, const u32 padbit);
|
||||
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
|
||||
const u32 nonce[4]);
|
||||
asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
|
||||
const u32 nonce[4]);
|
||||
asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
|
||||
const u32 padbit);
|
||||
asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
|
||||
const u32 padbit);
|
||||
asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
|
||||
const size_t len, const u32 padbit);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
|
||||
|
||||
/*
 * Layout that the C code in this file imposes on the opaque ctx pointer:
 * both convert_to_base2_64() and poly1305_simd_blocks() cast ctx to this
 * struct.  The accumulator is stored in one of two overlapping forms,
 * selected by is_base2_26.
 */
struct poly1305_arch_internal {
|
||||
union {
|
||||
struct {
|
||||
/* accumulator limbs; convert_to_base2_64() carries each at 26 bits */
u32 h[5];
|
||||
/* non-zero while the accumulator is held in h[] rather than hs[] */
u32 is_base2_26;
|
||||
};
|
||||
/* same storage viewed as 64-bit words; written by convert_to_base2_64() */
u64 hs[3];
|
||||
};
|
||||
/* NOTE(review): r, pad and rn are not touched by the C code visible here;
 * presumably they are filled and read by the assembly routines -- confirm
 * against the assembly before changing their order or size. */
u64 r[2];
|
||||
u64 pad;
|
||||
struct { u32 r2, r1, r4, r3; } rn[9];
|
||||
};
|
||||
|
||||
asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
|
||||
const u32 *r, unsigned int blocks);
|
||||
asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
|
||||
unsigned int blocks, const u32 *u);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
|
||||
unsigned int blocks, const u32 *u);
|
||||
static bool poly1305_use_avx2;
|
||||
#endif
|
||||
|
||||
static int poly1305_simd_init(struct shash_desc *desc)
|
||||
/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
|
||||
* the unfortunate situation of using AVX and then having to go back to scalar
|
||||
* -- because the user is silly and has called the update function from two
|
||||
* separate contexts -- then we need to convert back to the original base before
|
||||
* proceeding. It is possible to reason that the initial reduction below is
|
||||
* sufficient given the implementation invariants. However, for an avoidance of
|
||||
* doubt and because this is not performance critical, we do the full reduction
|
||||
* anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
|
||||
*/
|
||||
static void convert_to_base2_64(void *ctx)
|
||||
{
|
||||
struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
|
||||
struct poly1305_arch_internal *state = ctx;
|
||||
u32 cy;
|
||||
|
||||
sctx->uset = false;
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
sctx->wset = false;
|
||||
#endif
|
||||
if (!state->is_base2_26)
|
||||
return;
|
||||
|
||||
return crypto_poly1305_init(desc);
|
||||
cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
|
||||
cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
|
||||
cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
|
||||
cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
|
||||
state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
|
||||
state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
|
||||
state->hs[2] = state->h[4] >> 24;
|
||||
#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
|
||||
cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
|
||||
state->hs[2] &= 3;
|
||||
state->hs[0] += cy;
|
||||
state->hs[1] += (cy = ULT(state->hs[0], cy));
|
||||
state->hs[2] += ULT(state->hs[1], cy);
|
||||
#undef ULT
|
||||
state->is_base2_26 = 0;
|
||||
}
|
||||
|
||||
static void poly1305_simd_mult(u32 *a, const u32 *b)
|
||||
static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
|
||||
{
|
||||
u8 m[POLY1305_BLOCK_SIZE];
|
||||
|
||||
memset(m, 0, sizeof(m));
|
||||
/* The poly1305 block function adds a hi-bit to the accumulator which
|
||||
* we don't need for key multiplication; compensate for it. */
|
||||
a[4] -= 1 << 24;
|
||||
poly1305_block_sse2(a, m, b, 1);
|
||||
poly1305_init_x86_64(ctx, key);
|
||||
}
|
||||
|
||||
static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, unsigned int srclen)
|
||||
static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
|
||||
const u32 padbit)
|
||||
{
|
||||
struct poly1305_simd_desc_ctx *sctx;
|
||||
unsigned int blocks, datalen;
|
||||
struct poly1305_arch_internal *state = ctx;
|
||||
|
||||
BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
|
||||
sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
|
||||
/* SIMD disables preemption, so relax after processing each page. */
|
||||
BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
|
||||
SZ_4K % POLY1305_BLOCK_SIZE);
|
||||
|
||||
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
|
||||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
|
||||
!may_use_simd()) {
|
||||
convert_to_base2_64(ctx);
|
||||
poly1305_blocks_x86_64(ctx, inp, len, padbit);
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
const size_t bytes = min_t(size_t, len, SZ_4K);
|
||||
|
||||
kernel_fpu_begin();
|
||||
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
|
||||
poly1305_blocks_avx512(ctx, inp, bytes, padbit);
|
||||
else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
|
||||
poly1305_blocks_avx2(ctx, inp, bytes, padbit);
|
||||
else
|
||||
poly1305_blocks_avx(ctx, inp, bytes, padbit);
|
||||
kernel_fpu_end();
|
||||
|
||||
len -= bytes;
|
||||
inp += bytes;
|
||||
} while (len);
|
||||
}
|
||||
|
||||
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
|
||||
const u32 nonce[4])
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
|
||||
poly1305_emit_x86_64(ctx, mac, nonce);
|
||||
else
|
||||
poly1305_emit_avx(ctx, mac, nonce);
|
||||
}
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_simd_init(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(&key[16]);
|
||||
dctx->s[1] = get_unaligned_le32(&key[20]);
|
||||
dctx->s[2] = get_unaligned_le32(&key[24]);
|
||||
dctx->s[3] = get_unaligned_le32(&key[28]);
|
||||
dctx->buflen = 0;
|
||||
dctx->sset = true;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *inp, unsigned int len)
|
||||
{
|
||||
unsigned int acc = 0;
|
||||
if (unlikely(!dctx->sset)) {
|
||||
datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
|
||||
src += srclen - datalen;
|
||||
srclen = datalen;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
|
||||
if (unlikely(!sctx->wset)) {
|
||||
if (!sctx->uset) {
|
||||
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r.r);
|
||||
sctx->uset = true;
|
||||
}
|
||||
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u + 5, dctx->r.r);
|
||||
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u + 10, dctx->r.r);
|
||||
sctx->wset = true;
|
||||
if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
|
||||
poly1305_simd_init(&dctx->h, inp);
|
||||
inp += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
acc += POLY1305_BLOCK_SIZE;
|
||||
dctx->rset = 1;
|
||||
}
|
||||
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
|
||||
poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
|
||||
sctx->u);
|
||||
src += POLY1305_BLOCK_SIZE * 4 * blocks;
|
||||
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
|
||||
}
|
||||
#endif
|
||||
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
|
||||
if (unlikely(!sctx->uset)) {
|
||||
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r.r);
|
||||
sctx->uset = true;
|
||||
if (len >= POLY1305_BLOCK_SIZE) {
|
||||
dctx->s[0] = get_unaligned_le32(&inp[0]);
|
||||
dctx->s[1] = get_unaligned_le32(&inp[4]);
|
||||
dctx->s[2] = get_unaligned_le32(&inp[8]);
|
||||
dctx->s[3] = get_unaligned_le32(&inp[12]);
|
||||
inp += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
acc += POLY1305_BLOCK_SIZE;
|
||||
dctx->sset = true;
|
||||
}
|
||||
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
|
||||
poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
|
||||
sctx->u);
|
||||
src += POLY1305_BLOCK_SIZE * 2 * blocks;
|
||||
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
|
||||
}
|
||||
if (srclen >= POLY1305_BLOCK_SIZE) {
|
||||
poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
|
||||
srclen -= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
return srclen;
|
||||
return acc;
|
||||
}
|
||||
|
||||
static int poly1305_simd_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int srclen)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
unsigned int bytes;
|
||||
|
||||
/* kernel_fpu_begin/end is costly, use fallback for small updates */
|
||||
if (srclen <= 288 || !may_use_simd())
|
||||
return crypto_poly1305_update(desc, src, srclen);
|
||||
|
||||
kernel_fpu_begin();
|
||||
unsigned int bytes, used;
|
||||
|
||||
if (unlikely(dctx->buflen)) {
|
||||
bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
@@ -139,34 +179,76 @@ static int poly1305_simd_update(struct shash_desc *desc,
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_simd_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE);
|
||||
if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
|
||||
poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
|
||||
bytes = poly1305_simd_blocks(dctx, src, srclen);
|
||||
src += srclen - bytes;
|
||||
srclen = bytes;
|
||||
bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
|
||||
srclen -= bytes;
|
||||
used = crypto_poly1305_setdctxkey(dctx, src, bytes);
|
||||
if (likely(bytes - used))
|
||||
poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
|
||||
src += bytes;
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
if (unlikely(srclen)) {
|
||||
dctx->buflen = srclen;
|
||||
memcpy(dctx->buf, src, srclen);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
/*
 * Finish a Poly1305 computation and write the tag to @dst.
 *
 * @dctx: running state; it is wiped before this function returns.
 * @dst:  output buffer, passed to the emit routine as
 *        u8 mac[POLY1305_DIGEST_SIZE].
 *
 * If a partial block is still buffered it is terminated with a 0x01 byte,
 * zero-padded up to POLY1305_BLOCK_SIZE and absorbed with padbit 0 (the
 * terminator is already part of the data).  The tag is then emitted using
 * dctx->s as the nonce.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		dctx->buf[dctx->buflen++] = 1;
		memset(dctx->buf + dctx->buflen, 0,
		       POLY1305_BLOCK_SIZE - dctx->buflen);
		poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_simd_emit(&dctx->h, dst, dctx->s);

	/*
	 * The state contains key-derived words (dctx->s).  A plain struct
	 * assignment is an ordinary store that the compiler may eliminate
	 * when it can see the object is not read again; memzero_explicit()
	 * cannot be optimised away.
	 */
	memzero_explicit(dctx, sizeof(*dctx));
}
EXPORT_SYMBOL(poly1305_final_arch);
|
||||
|
||||
static int crypto_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
*dctx = (struct poly1305_desc_ctx){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
poly1305_update_arch(dctx, src, srclen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * shash .final hook: emit the tag into @dst.
 *
 * Returns 0 on success, or -ENOKEY when the s-part of the key has not been
 * set on this request (dctx->sset is false), in which case nothing is
 * written to @dst.
 */
static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *state = shash_desc_ctx(desc);

	if (likely(state->sset)) {
		poly1305_final_arch(state, dst);
		return 0;
	}

	return -ENOKEY;
}
|
||||
|
||||
static struct shash_alg alg = {
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.init = poly1305_simd_init,
|
||||
.update = poly1305_simd_update,
|
||||
.init = crypto_poly1305_init,
|
||||
.update = crypto_poly1305_update,
|
||||
.final = crypto_poly1305_final,
|
||||
.descsize = sizeof(struct poly1305_simd_desc_ctx),
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
.base = {
|
||||
.cra_name = "poly1305",
|
||||
.cra_driver_name = "poly1305-simd",
|
||||
@@ -178,30 +260,33 @@ static struct shash_alg alg = {
|
||||
|
||||
static int __init poly1305_simd_mod_init(void)
|
||||
{
|
||||
if (!boot_cpu_has(X86_FEATURE_XMM2))
|
||||
return -ENODEV;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
||||
alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
|
||||
if (poly1305_use_avx2)
|
||||
alg.descsize += 10 * sizeof(u32);
|
||||
#endif
|
||||
return crypto_register_shash(&alg);
|
||||
if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
|
||||
static_branch_enable(&poly1305_use_avx);
|
||||
if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
|
||||
static_branch_enable(&poly1305_use_avx2);
|
||||
if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
|
||||
/* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
|
||||
boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
|
||||
static_branch_enable(&poly1305_use_avx512);
|
||||
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
|
||||
}
|
||||
|
||||
static void __exit poly1305_simd_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&alg);
|
||||
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||
crypto_unregister_shash(&alg);
|
||||
}
|
||||
|
||||
module_init(poly1305_simd_mod_init);
|
||||
module_exit(poly1305_simd_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
|
||||
MODULE_DESCRIPTION("Poly1305 authenticator");
|
||||
MODULE_ALIAS_CRYPTO("poly1305");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-simd");
|
||||
|
||||
@@ -387,7 +387,7 @@ static __init int _init_events_attrs(void)
|
||||
while (amd_iommu_v2_event_descs[i].attr.attr.name)
|
||||
i++;
|
||||
|
||||
attrs = kcalloc(i + 1, sizeof(struct attribute **), GFP_KERNEL);
|
||||
attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL);
|
||||
if (!attrs)
|
||||
return -ENOMEM;
|
||||
|
||||
|
||||
@@ -249,9 +249,9 @@ static void __init fpu__init_system_ctx_switch(void)
|
||||
*/
|
||||
static void __init fpu__init_parse_early_param(void)
|
||||
{
|
||||
char arg[32];
|
||||
char arg[128];
|
||||
char *argptr = arg;
|
||||
int bit;
|
||||
int arglen, res, bit;
|
||||
|
||||
if (cmdline_find_option_bool(boot_command_line, "no387"))
|
||||
setup_clear_cpu_cap(X86_FEATURE_FPU);
|
||||
@@ -271,12 +271,26 @@ static void __init fpu__init_parse_early_param(void)
|
||||
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
|
||||
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
|
||||
|
||||
if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
|
||||
sizeof(arg)) &&
|
||||
get_option(&argptr, &bit) &&
|
||||
bit >= 0 &&
|
||||
bit < NCAPINTS * 32)
|
||||
setup_clear_cpu_cap(bit);
|
||||
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
|
||||
if (arglen <= 0)
|
||||
return;
|
||||
|
||||
pr_info("Clearing CPUID bits:");
|
||||
do {
|
||||
res = get_option(&argptr, &bit);
|
||||
if (res == 0 || res == 3)
|
||||
break;
|
||||
|
||||
/* If the argument was too long, the last bit may be cut off */
|
||||
if (res == 1 && arglen >= sizeof(arg))
|
||||
break;
|
||||
|
||||
if (bit >= 0 && bit < NCAPINTS * 32) {
|
||||
pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
|
||||
setup_clear_cpu_cap(bit);
|
||||
}
|
||||
} while (res == 2);
|
||||
pr_cont("\n");
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -104,7 +104,6 @@ fs_initcall(nmi_warning_debugfs);
|
||||
|
||||
static void nmi_check_duration(struct nmiaction *action, u64 duration)
|
||||
{
|
||||
u64 whole_msecs = READ_ONCE(action->max_duration);
|
||||
int remainder_ns, decimal_msecs;
|
||||
|
||||
if (duration < nmi_longest_ns || duration < action->max_duration)
|
||||
@@ -112,12 +111,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
|
||||
|
||||
action->max_duration = duration;
|
||||
|
||||
remainder_ns = do_div(whole_msecs, (1000 * 1000));
|
||||
remainder_ns = do_div(duration, (1000 * 1000));
|
||||
decimal_msecs = remainder_ns / 1000;
|
||||
|
||||
printk_ratelimited(KERN_INFO
|
||||
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
|
||||
action->handler, whole_msecs, decimal_msecs);
|
||||
action->handler, duration, decimal_msecs);
|
||||
}
|
||||
|
||||
static int nmi_handle(unsigned int type, struct pt_regs *regs)
|
||||
|
||||
@@ -3561,7 +3561,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
|
||||
u64 tsc_aux = 0;
|
||||
|
||||
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
|
||||
return emulate_gp(ctxt, 0);
|
||||
return emulate_ud(ctxt);
|
||||
ctxt->dst.val = tsc_aux;
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
|
||||
@@ -6225,6 +6225,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
|
||||
cond_resched_lock(&kvm->mmu_lock);
|
||||
}
|
||||
}
|
||||
kvm_mmu_commit_zap_page(kvm, &invalid_list);
|
||||
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
srcu_read_unlock(&kvm->srcu, rcu_idx);
|
||||
|
||||
@@ -5380,6 +5380,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
|
||||
* - Tell IOMMU to use legacy mode for this interrupt.
|
||||
* - Retrieve ga_tag of prior interrupt remapping data.
|
||||
*/
|
||||
pi.prev_ga_tag = 0;
|
||||
pi.is_guest_mode = false;
|
||||
ret = irq_set_vcpu_affinity(host_irq, &pi);
|
||||
|
||||
|
||||
@@ -2129,11 +2129,10 @@ static void handle_bad_sector(struct bio *bio, sector_t maxsector)
|
||||
{
|
||||
char b[BDEVNAME_SIZE];
|
||||
|
||||
printk(KERN_INFO "attempt to access beyond end of device\n");
|
||||
printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
|
||||
bio_devname(bio, b), bio->bi_opf,
|
||||
(unsigned long long)bio_end_sector(bio),
|
||||
(long long)maxsector);
|
||||
pr_info_ratelimited("attempt to access beyond end of device\n"
|
||||
"%s: rw=%d, want=%llu, limit=%llu\n",
|
||||
bio_devname(bio, b), bio->bi_opf,
|
||||
bio_end_sector(bio), maxsector);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
||||
|
||||
@@ -3,10 +3,11 @@ ARCH=arm64
|
||||
CLANG_TRIPLE=aarch64-linux-gnu-
|
||||
CROSS_COMPILE=aarch64-linux-androidkernel-
|
||||
CROSS_COMPILE_COMPAT=arm-linux-androidkernel-
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
|
||||
LINUX_GCC_CROSS_COMPILE_COMPAT_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
|
||||
LINUX_GCC_CROSS_COMPILE_COMPAT_PREBUILTS_BIN=prebuilts/gas/linux-x86
|
||||
|
||||
FILES="
|
||||
arch/arm64/boot/Image
|
||||
arch/arm64/boot/Image.gz
|
||||
vmlinux
|
||||
System.map
|
||||
|
||||
@@ -2,7 +2,7 @@ ARCH=arm
|
||||
|
||||
CLANG_TRIPLE=arm-linux-gnueabi-
|
||||
CROSS_COMPILE=arm-linux-androidkernel-
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
|
||||
|
||||
FILES="
|
||||
arch/arm/boot/Image.gz
|
||||
|
||||
@@ -3,7 +3,7 @@ KMI_GENERATION=0
|
||||
|
||||
LLVM=1
|
||||
DEPMOD=depmod
|
||||
CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r383902/bin
|
||||
CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r399163b/bin
|
||||
BUILDTOOLS_PREBUILT_BIN=build/build-tools/path/linux-x86
|
||||
|
||||
EXTRA_CMDS=''
|
||||
|
||||
@@ -2,7 +2,7 @@ ARCH=x86_64
|
||||
|
||||
CLANG_TRIPLE=x86_64-linux-gnu-
|
||||
CROSS_COMPILE=x86_64-linux-androidkernel-
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
|
||||
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
|
||||
|
||||
FILES="
|
||||
arch/x86/boot/bzImage
|
||||
|
||||
@@ -163,7 +163,6 @@ config CRYPTO_USER
|
||||
config CRYPTO_MANAGER_DISABLE_TESTS
|
||||
bool "Disable run-time self tests"
|
||||
default y
|
||||
depends on CRYPTO_MANAGER2
|
||||
help
|
||||
Disable run-time self tests that normally take place at
|
||||
algorithm registration.
|
||||
@@ -257,6 +256,17 @@ config CRYPTO_GLUE_HELPER_X86
|
||||
config CRYPTO_ENGINE
|
||||
tristate
|
||||
|
||||
config CRYPTO_CURVE25519
|
||||
tristate "Curve25519 algorithm"
|
||||
select CRYPTO_KPP
|
||||
select CRYPTO_LIB_CURVE25519_GENERIC
|
||||
|
||||
config CRYPTO_CURVE25519_X86
|
||||
tristate "x86_64 accelerated Curve25519 scalar multiplication library"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_LIB_CURVE25519_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
|
||||
|
||||
comment "Authenticated Encryption with Associated Data"
|
||||
|
||||
config CRYPTO_CCM
|
||||
@@ -498,12 +508,12 @@ config CRYPTO_KEYWRAP
|
||||
config CRYPTO_NHPOLY1305
|
||||
tristate
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_POLY1305
|
||||
select CRYPTO_LIB_POLY1305_GENERIC
|
||||
|
||||
config CRYPTO_ADIANTUM
|
||||
tristate "Adiantum support"
|
||||
select CRYPTO_CHACHA20
|
||||
select CRYPTO_POLY1305
|
||||
select CRYPTO_LIB_POLY1305_GENERIC
|
||||
select CRYPTO_NHPOLY1305
|
||||
help
|
||||
Adiantum is a tweakable, length-preserving encryption mode
|
||||
@@ -638,6 +648,30 @@ config CRYPTO_CRC32_MIPS
|
||||
instructions, when available.
|
||||
|
||||
|
||||
config CRYPTO_BLAKE2S
|
||||
tristate "BLAKE2s digest algorithm"
|
||||
select CRYPTO_LIB_BLAKE2S_GENERIC
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
Implementation of cryptographic hash function BLAKE2s
|
||||
optimized for 8-32bit platforms and can produce digests of any size
|
||||
between 1 and 32 bytes. The keyed hash is also implemented.
|
||||
|
||||
This module provides the following algorithms:
|
||||
|
||||
- blake2s-128
|
||||
- blake2s-160
|
||||
- blake2s-224
|
||||
- blake2s-256
|
||||
|
||||
See https://blake2.net for further information.
|
||||
|
||||
config CRYPTO_BLAKE2S_X86
|
||||
tristate "BLAKE2s digest algorithm (x86 accelerated version)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_LIB_BLAKE2S_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
|
||||
|
||||
config CRYPTO_CRCT10DIF
|
||||
tristate "CRCT10DIF algorithm"
|
||||
select CRYPTO_HASH
|
||||
@@ -684,6 +718,7 @@ config CRYPTO_GHASH
|
||||
config CRYPTO_POLY1305
|
||||
tristate "Poly1305 authenticator algorithm"
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_LIB_POLY1305_GENERIC
|
||||
help
|
||||
Poly1305 authenticator algorithm, RFC7539.
|
||||
|
||||
@@ -694,7 +729,8 @@ config CRYPTO_POLY1305
|
||||
config CRYPTO_POLY1305_X86_64
|
||||
tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_POLY1305
|
||||
select CRYPTO_LIB_POLY1305_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||
help
|
||||
Poly1305 authenticator algorithm, RFC7539.
|
||||
|
||||
@@ -703,6 +739,11 @@ config CRYPTO_POLY1305_X86_64
|
||||
in IETF protocols. This is the x86_64 assembler implementation using SIMD
|
||||
instructions.
|
||||
|
||||
config CRYPTO_POLY1305_MIPS
|
||||
tristate "Poly1305 authenticator algorithm (MIPS optimized)"
|
||||
depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
|
||||
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||
|
||||
config CRYPTO_MD4
|
||||
tristate "MD4 digest algorithm"
|
||||
select CRYPTO_HASH
|
||||
@@ -1467,6 +1508,7 @@ config CRYPTO_SALSA20
|
||||
|
||||
config CRYPTO_CHACHA20
|
||||
tristate "ChaCha stream cipher algorithms"
|
||||
select CRYPTO_LIB_CHACHA_GENERIC
|
||||
select CRYPTO_BLKCIPHER
|
||||
help
|
||||
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
|
||||
@@ -1487,19 +1529,20 @@ config CRYPTO_CHACHA20
|
||||
in some performance-sensitive scenarios.
|
||||
|
||||
config CRYPTO_CHACHA20_X86_64
|
||||
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
|
||||
tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_BLKCIPHER
|
||||
select CRYPTO_CHACHA20
|
||||
select CRYPTO_LIB_CHACHA_GENERIC
|
||||
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||
help
|
||||
ChaCha20 cipher algorithm, RFC7539.
|
||||
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
|
||||
XChaCha20, and XChaCha12 stream ciphers.
|
||||
|
||||
ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
|
||||
Bernstein and further specified in RFC7539 for use in IETF protocols.
|
||||
This is the x86_64 assembler implementation using SIMD instructions.
|
||||
|
||||
See also:
|
||||
<http://cr.yp.to/chacha/chacha-20080128.pdf>
|
||||
config CRYPTO_CHACHA_MIPS
|
||||
tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
|
||||
depends on CPU_MIPS32_R2
|
||||
select CRYPTO_BLKCIPHER
|
||||
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||
|
||||
config CRYPTO_SEED
|
||||
tristate "SEED cipher algorithm"
|
||||
@@ -1901,6 +1944,7 @@ config CRYPTO_USER_API_AEAD
|
||||
config CRYPTO_HASH_INFO
|
||||
bool
|
||||
|
||||
source "lib/crypto/Kconfig"
|
||||
source "drivers/crypto/Kconfig"
|
||||
source crypto/asymmetric_keys/Kconfig
|
||||
source certs/Kconfig
|
||||
|
||||
@@ -73,6 +73,7 @@ obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
|
||||
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
|
||||
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
|
||||
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
|
||||
obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
|
||||
obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
|
||||
obj-$(CONFIG_CRYPTO_ECB) += ecb.o
|
||||
obj-$(CONFIG_CRYPTO_CBC) += cbc.o
|
||||
@@ -149,6 +150,7 @@ ecdh_generic-y := ecc.o
|
||||
ecdh_generic-y += ecdh.o
|
||||
ecdh_generic-y += ecdh_helper.o
|
||||
obj-$(CONFIG_CRYPTO_ECDH) += ecdh_generic.o
|
||||
obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o
|
||||
|
||||
#
|
||||
# generic algorithms and the async_tx api
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <crypto/b128ops.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <crypto/nhpoly1305.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
@@ -71,7 +72,7 @@ struct adiantum_tfm_ctx {
|
||||
struct crypto_skcipher *streamcipher;
|
||||
struct crypto_cipher *blockcipher;
|
||||
struct crypto_shash *hash;
|
||||
struct poly1305_key header_hash_key;
|
||||
struct poly1305_core_key header_hash_key;
|
||||
};
|
||||
|
||||
struct adiantum_request_ctx {
|
||||
@@ -242,13 +243,13 @@ static void adiantum_hash_header(struct skcipher_request *req)
|
||||
|
||||
BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
|
||||
poly1305_core_blocks(&state, &tctx->header_hash_key,
|
||||
&header, sizeof(header) / POLY1305_BLOCK_SIZE);
|
||||
&header, sizeof(header) / POLY1305_BLOCK_SIZE, 1);
|
||||
|
||||
BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
|
||||
poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
|
||||
TWEAK_SIZE / POLY1305_BLOCK_SIZE);
|
||||
TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
|
||||
|
||||
poly1305_core_emit(&state, &rctx->header_hash);
|
||||
poly1305_core_emit(&state, NULL, &rctx->header_hash);
|
||||
}
|
||||
|
||||
/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
|
||||
|
||||
@@ -82,7 +82,7 @@ static int crypto_aead_copy_sgl(struct crypto_skcipher *null_tfm,
|
||||
SKCIPHER_REQUEST_ON_STACK(skreq, null_tfm);
|
||||
|
||||
skcipher_request_set_tfm(skreq, null_tfm);
|
||||
skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_BACKLOG,
|
||||
skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_SLEEP,
|
||||
NULL, NULL);
|
||||
skcipher_request_set_crypt(skreq, src, dst, len, NULL);
|
||||
|
||||
@@ -295,19 +295,20 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
|
||||
areq->outlen = outlen;
|
||||
|
||||
aead_request_set_callback(&areq->cra_u.aead_req,
|
||||
CRYPTO_TFM_REQ_MAY_BACKLOG,
|
||||
CRYPTO_TFM_REQ_MAY_SLEEP,
|
||||
af_alg_async_cb, areq);
|
||||
err = ctx->enc ? crypto_aead_encrypt(&areq->cra_u.aead_req) :
|
||||
crypto_aead_decrypt(&areq->cra_u.aead_req);
|
||||
|
||||
/* AIO operation in progress */
|
||||
if (err == -EINPROGRESS || err == -EBUSY)
|
||||
if (err == -EINPROGRESS)
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
sock_put(sk);
|
||||
} else {
|
||||
/* Synchronous operation */
|
||||
aead_request_set_callback(&areq->cra_u.aead_req,
|
||||
CRYPTO_TFM_REQ_MAY_SLEEP |
|
||||
CRYPTO_TFM_REQ_MAY_BACKLOG,
|
||||
crypto_req_done, &ctx->wait);
|
||||
err = crypto_wait_req(ctx->enc ?
|
||||
|
||||
@@ -127,7 +127,7 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
|
||||
crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
|
||||
|
||||
/* AIO operation in progress */
|
||||
if (err == -EINPROGRESS || err == -EBUSY)
|
||||
if (err == -EINPROGRESS)
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
sock_put(sk);
|
||||
|
||||
170
crypto/blake2s_generic.c
Normal file
170
crypto/blake2s_generic.c
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||
/*
|
||||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
*/
|
||||
|
||||
#include <crypto/internal/blake2s.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
{
|
||||
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
||||
|
||||
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
memcpy(tctx->key, key, keylen);
|
||||
tctx->keylen = keylen;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_blake2s_init(struct shash_desc *desc)
|
||||
{
|
||||
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||
const int outlen = crypto_shash_digestsize(desc->tfm);
|
||||
|
||||
if (tctx->keylen)
|
||||
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
|
||||
else
|
||||
blake2s_init(state, outlen);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
|
||||
unsigned int inlen)
|
||||
{
|
||||
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
|
||||
|
||||
if (unlikely(!inlen))
|
||||
return 0;
|
||||
if (inlen > fill) {
|
||||
memcpy(state->buf + state->buflen, in, fill);
|
||||
blake2s_compress_generic(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
|
||||
state->buflen = 0;
|
||||
in += fill;
|
||||
inlen -= fill;
|
||||
}
|
||||
if (inlen > BLAKE2S_BLOCK_SIZE) {
|
||||
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
|
||||
/* Hash one less (full) block than strictly possible */
|
||||
blake2s_compress_generic(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
|
||||
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||
}
|
||||
memcpy(state->buf + state->buflen, in, inlen);
|
||||
state->buflen += inlen;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Finish the hash and write state->outlen digest bytes to @out.
 *
 * The buffered tail is zero-padded to a full block and compressed as the
 * last block, the chaining words are converted to little endian, and the
 * whole state is wiped afterwards.  Always returns 0.
 */
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
	struct blake2s_state *st = shash_desc_ctx(desc);
	size_t padlen;

	blake2s_set_lastblock(st);

	/* Padding */
	padlen = BLAKE2S_BLOCK_SIZE - st->buflen;
	memset(&st->buf[st->buflen], 0, padlen);

	blake2s_compress_generic(st, st->buf, 1, st->buflen);
	cpu_to_le32_array(st->h, ARRAY_SIZE(st->h));
	memcpy(out, st->h, st->outlen);

	/* Do not leave hash state behind in the descriptor. */
	memzero_explicit(st, sizeof(*st));

	return 0;
}
|
||||
|
||||
static struct shash_alg blake2s_algs[] = {{
|
||||
.base.cra_name = "blake2s-128",
|
||||
.base.cra_driver_name = "blake2s-128-generic",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_128_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-160",
|
||||
.base.cra_driver_name = "blake2s-160-generic",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_160_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-224",
|
||||
.base.cra_driver_name = "blake2s-224-generic",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_224_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}, {
|
||||
.base.cra_name = "blake2s-256",
|
||||
.base.cra_driver_name = "blake2s-256-generic",
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = BLAKE2S_256_HASH_SIZE,
|
||||
.setkey = crypto_blake2s_setkey,
|
||||
.init = crypto_blake2s_init,
|
||||
.update = crypto_blake2s_update,
|
||||
.final = crypto_blake2s_final,
|
||||
.descsize = sizeof(struct blake2s_state),
|
||||
}};
|
||||
|
||||
/* Register every entry of blake2s_algs; returns the registration result. */
static int __init blake2s_mod_init(void)
{
	const int count = ARRAY_SIZE(blake2s_algs);

	return crypto_register_shashes(blake2s_algs, count);
}
|
||||
|
||||
/* Unregister every entry of blake2s_algs. */
static void __exit blake2s_mod_exit(void)
{
	const int count = ARRAY_SIZE(blake2s_algs);

	crypto_unregister_shashes(blake2s_algs, count);
}
|
||||
|
||||
subsys_initcall(blake2s_mod_init);
|
||||
module_exit(blake2s_mod_exit);
|
||||
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128-generic");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160-generic");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224-generic");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256-generic");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
@@ -12,33 +12,12 @@
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes, int nrounds)
|
||||
{
|
||||
/* aligned to potentially speed up crypto_xor() */
|
||||
u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
|
||||
|
||||
if (dst != src)
|
||||
memcpy(dst, src, bytes);
|
||||
|
||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
||||
chacha_block(state, stream, nrounds);
|
||||
crypto_xor(dst, stream, CHACHA_BLOCK_SIZE);
|
||||
bytes -= CHACHA_BLOCK_SIZE;
|
||||
dst += CHACHA_BLOCK_SIZE;
|
||||
}
|
||||
if (bytes) {
|
||||
chacha_block(state, stream, nrounds);
|
||||
crypto_xor(dst, stream, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha_stream_xor(struct skcipher_request *req,
|
||||
struct chacha_ctx *ctx, u8 *iv)
|
||||
const struct chacha_ctx *ctx, const u8 *iv)
|
||||
{
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
@@ -46,7 +25,7 @@ static int chacha_stream_xor(struct skcipher_request *req,
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
crypto_chacha_init(state, ctx, iv);
|
||||
chacha_init_generic(state, ctx->key, iv);
|
||||
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
@@ -54,75 +33,23 @@ static int chacha_stream_xor(struct skcipher_request *req,
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes, ctx->nrounds);
|
||||
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv)
|
||||
{
|
||||
state[0] = 0x61707865; /* "expa" */
|
||||
state[1] = 0x3320646e; /* "nd 3" */
|
||||
state[2] = 0x79622d32; /* "2-by" */
|
||||
state[3] = 0x6b206574; /* "te k" */
|
||||
state[4] = ctx->key[0];
|
||||
state[5] = ctx->key[1];
|
||||
state[6] = ctx->key[2];
|
||||
state[7] = ctx->key[3];
|
||||
state[8] = ctx->key[4];
|
||||
state[9] = ctx->key[5];
|
||||
state[10] = ctx->key[6];
|
||||
state[11] = ctx->key[7];
|
||||
state[12] = get_unaligned_le32(iv + 0);
|
||||
state[13] = get_unaligned_le32(iv + 4);
|
||||
state[14] = get_unaligned_le32(iv + 8);
|
||||
state[15] = get_unaligned_le32(iv + 12);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha_init);
|
||||
|
||||
static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
|
||||
unsigned int keysize, int nrounds)
|
||||
{
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int i;
|
||||
|
||||
if (keysize != CHACHA_KEY_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
|
||||
ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
|
||||
|
||||
ctx->nrounds = nrounds;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
|
||||
unsigned int keysize)
|
||||
{
|
||||
return chacha_setkey(tfm, key, keysize, 20);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
|
||||
|
||||
int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
|
||||
unsigned int keysize)
|
||||
{
|
||||
return chacha_setkey(tfm, key, keysize, 12);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
|
||||
|
||||
int crypto_chacha_crypt(struct skcipher_request *req)
|
||||
static int crypto_chacha_crypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
return chacha_stream_xor(req, ctx, req->iv);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
|
||||
|
||||
int crypto_xchacha_crypt(struct skcipher_request *req)
|
||||
static int crypto_xchacha_crypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
@@ -131,8 +58,8 @@ int crypto_xchacha_crypt(struct skcipher_request *req)
|
||||
u8 real_iv[16];
|
||||
|
||||
/* Compute the subkey given the original key and first 128 nonce bits */
|
||||
crypto_chacha_init(state, ctx, req->iv);
|
||||
hchacha_block(state, subctx.key, ctx->nrounds);
|
||||
chacha_init_generic(state, ctx->key, req->iv);
|
||||
hchacha_block_generic(state, subctx.key, ctx->nrounds);
|
||||
subctx.nrounds = ctx->nrounds;
|
||||
|
||||
/* Build the real IV */
|
||||
@@ -142,7 +69,6 @@ int crypto_xchacha_crypt(struct skcipher_request *req)
|
||||
/* Generate the stream and XOR it with the data */
|
||||
return chacha_stream_xor(req, &subctx, real_iv);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
|
||||
|
||||
static struct skcipher_alg algs[] = {
|
||||
{
|
||||
@@ -157,7 +83,7 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = crypto_chacha_crypt,
|
||||
.decrypt = crypto_chacha_crypt,
|
||||
}, {
|
||||
@@ -172,7 +98,7 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.setkey = chacha20_setkey,
|
||||
.encrypt = crypto_xchacha_crypt,
|
||||
.decrypt = crypto_xchacha_crypt,
|
||||
}, {
|
||||
@@ -187,7 +113,7 @@ static struct skcipher_alg algs[] = {
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha12_setkey,
|
||||
.setkey = chacha12_setkey,
|
||||
.encrypt = crypto_xchacha_crypt,
|
||||
.decrypt = crypto_xchacha_crypt,
|
||||
}
|
||||
|
||||
90
crypto/curve25519-generic.c
Normal file
90
crypto/curve25519-generic.c
Normal file
@@ -0,0 +1,90 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <crypto/curve25519.h>
|
||||
#include <crypto/internal/kpp.h>
|
||||
#include <crypto/kpp.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/scatterlist.h>
|
||||
|
||||
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
|
||||
unsigned int len)
|
||||
{
|
||||
u8 *secret = kpp_tfm_ctx(tfm);
|
||||
|
||||
if (!len)
|
||||
curve25519_generate_secret(secret);
|
||||
else if (len == CURVE25519_KEY_SIZE &&
|
||||
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
|
||||
memcpy(secret, buf, CURVE25519_KEY_SIZE);
|
||||
else
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int curve25519_compute_value(struct kpp_request *req)
|
||||
{
|
||||
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
|
||||
const u8 *secret = kpp_tfm_ctx(tfm);
|
||||
u8 public_key[CURVE25519_KEY_SIZE];
|
||||
u8 buf[CURVE25519_KEY_SIZE];
|
||||
int copied, nbytes;
|
||||
u8 const *bp;
|
||||
|
||||
if (req->src) {
|
||||
copied = sg_copy_to_buffer(req->src,
|
||||
sg_nents_for_len(req->src,
|
||||
CURVE25519_KEY_SIZE),
|
||||
public_key, CURVE25519_KEY_SIZE);
|
||||
if (copied != CURVE25519_KEY_SIZE)
|
||||
return -EINVAL;
|
||||
bp = public_key;
|
||||
} else {
|
||||
bp = curve25519_base_point;
|
||||
}
|
||||
|
||||
curve25519_generic(buf, secret, bp);
|
||||
|
||||
/* might want less than we've got */
|
||||
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
|
||||
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
|
||||
nbytes),
|
||||
buf, nbytes);
|
||||
if (copied != nbytes)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
|
||||
{
|
||||
return CURVE25519_KEY_SIZE;
|
||||
}
|
||||
|
||||
static struct kpp_alg curve25519_alg = {
|
||||
.base.cra_name = "curve25519",
|
||||
.base.cra_driver_name = "curve25519-generic",
|
||||
.base.cra_priority = 100,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
|
||||
|
||||
.set_secret = curve25519_set_secret,
|
||||
.generate_public_key = curve25519_compute_value,
|
||||
.compute_shared_secret = curve25519_compute_value,
|
||||
.max_size = curve25519_max_size,
|
||||
};
|
||||
|
||||
static int curve25519_init(void)
|
||||
{
|
||||
return crypto_register_kpp(&curve25519_alg);
|
||||
}
|
||||
|
||||
static void curve25519_exit(void)
|
||||
{
|
||||
crypto_unregister_kpp(&curve25519_alg);
|
||||
}
|
||||
|
||||
subsys_initcall(curve25519_init);
|
||||
module_exit(curve25519_exit);
|
||||
|
||||
MODULE_ALIAS_CRYPTO("curve25519");
|
||||
MODULE_ALIAS_CRYPTO("curve25519-generic");
|
||||
MODULE_LICENSE("GPL");
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <crypto/nhpoly1305.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/kernel.h>
|
||||
@@ -78,7 +79,7 @@ static void process_nh_hash_value(struct nhpoly1305_state *state,
|
||||
BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
|
||||
|
||||
poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
|
||||
NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
|
||||
NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -209,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
|
||||
if (state->nh_remaining)
|
||||
process_nh_hash_value(state, key);
|
||||
|
||||
poly1305_core_emit(&state->poly_state, dst);
|
||||
poly1305_core_emit(&state->poly_state, NULL, dst);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
|
||||
|
||||
@@ -13,65 +13,33 @@
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/poly1305.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
static inline u64 mlt(u64 a, u64 b)
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static inline u32 sr(u64 v, u_char n)
|
||||
{
|
||||
return v >> n;
|
||||
}
|
||||
|
||||
static inline u32 and(u32 v, u32 mask)
|
||||
{
|
||||
return v & mask;
|
||||
}
|
||||
|
||||
int crypto_poly1305_init(struct shash_desc *desc)
|
||||
static int crypto_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
poly1305_core_init(&dctx->h);
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = false;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_poly1305_init);
|
||||
|
||||
void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
|
||||
{
|
||||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
|
||||
key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
|
||||
key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
|
||||
key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
|
||||
key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
|
||||
key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(poly1305_core_setkey);
|
||||
|
||||
/*
|
||||
* Poly1305 requires a unique key for each tag, which implies that we can't set
|
||||
* it on the tfm that gets accessed by multiple users simultaneously. Instead we
|
||||
* expect the key as the first 32 bytes in the update() call.
|
||||
*/
|
||||
unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, unsigned int srclen)
|
||||
static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
if (!dctx->sset) {
|
||||
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
|
||||
poly1305_core_setkey(&dctx->r, src);
|
||||
poly1305_core_setkey(&dctx->core_r, src);
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
srclen -= POLY1305_BLOCK_SIZE;
|
||||
dctx->rset = true;
|
||||
dctx->rset = 2;
|
||||
}
|
||||
if (srclen >= POLY1305_BLOCK_SIZE) {
|
||||
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||
@@ -85,86 +53,9 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
|
||||
}
|
||||
return srclen;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
|
||||
|
||||
static void poly1305_blocks_internal(struct poly1305_state *state,
|
||||
const struct poly1305_key *key,
|
||||
const void *src, unsigned int nblocks,
|
||||
u32 hibit)
|
||||
{
|
||||
u32 r0, r1, r2, r3, r4;
|
||||
u32 s1, s2, s3, s4;
|
||||
u32 h0, h1, h2, h3, h4;
|
||||
u64 d0, d1, d2, d3, d4;
|
||||
|
||||
if (!nblocks)
|
||||
return;
|
||||
|
||||
r0 = key->r[0];
|
||||
r1 = key->r[1];
|
||||
r2 = key->r[2];
|
||||
r3 = key->r[3];
|
||||
r4 = key->r[4];
|
||||
|
||||
s1 = r1 * 5;
|
||||
s2 = r2 * 5;
|
||||
s3 = r3 * 5;
|
||||
s4 = r4 * 5;
|
||||
|
||||
h0 = state->h[0];
|
||||
h1 = state->h[1];
|
||||
h2 = state->h[2];
|
||||
h3 = state->h[3];
|
||||
h4 = state->h[4];
|
||||
|
||||
do {
|
||||
/* h += m[i] */
|
||||
h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
|
||||
h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
|
||||
h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
|
||||
h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
|
||||
h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
|
||||
|
||||
/* h *= r */
|
||||
d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
|
||||
mlt(h3, s2) + mlt(h4, s1);
|
||||
d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
|
||||
mlt(h3, s3) + mlt(h4, s2);
|
||||
d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
|
||||
mlt(h3, s4) + mlt(h4, s3);
|
||||
d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
|
||||
mlt(h3, r0) + mlt(h4, s4);
|
||||
d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
|
||||
mlt(h3, r1) + mlt(h4, r0);
|
||||
|
||||
/* (partial) h %= p */
|
||||
d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
|
||||
d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
|
||||
d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
|
||||
d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
|
||||
h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
|
||||
h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
|
||||
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
} while (--nblocks);
|
||||
|
||||
state->h[0] = h0;
|
||||
state->h[1] = h1;
|
||||
state->h[2] = h2;
|
||||
state->h[3] = h3;
|
||||
state->h[4] = h4;
|
||||
}
|
||||
|
||||
void poly1305_core_blocks(struct poly1305_state *state,
|
||||
const struct poly1305_key *key,
|
||||
const void *src, unsigned int nblocks)
|
||||
{
|
||||
poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(poly1305_core_blocks);
|
||||
|
||||
static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, unsigned int srclen, u32 hibit)
|
||||
static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int srclen)
|
||||
{
|
||||
unsigned int datalen;
|
||||
|
||||
@@ -174,12 +65,12 @@ static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
|
||||
srclen = datalen;
|
||||
}
|
||||
|
||||
poly1305_blocks_internal(&dctx->h, &dctx->r,
|
||||
src, srclen / POLY1305_BLOCK_SIZE, hibit);
|
||||
poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
|
||||
srclen / POLY1305_BLOCK_SIZE, 1);
|
||||
}
|
||||
|
||||
int crypto_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
static int crypto_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
unsigned int bytes;
|
||||
@@ -193,13 +84,13 @@ int crypto_poly1305_update(struct shash_desc *desc,
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1 << 24);
|
||||
POLY1305_BLOCK_SIZE);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
|
||||
poly1305_blocks(dctx, src, srclen, 1 << 24);
|
||||
poly1305_blocks(dctx, src, srclen);
|
||||
src += srclen - (srclen % POLY1305_BLOCK_SIZE);
|
||||
srclen %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
@@ -211,87 +102,17 @@ int crypto_poly1305_update(struct shash_desc *desc,
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_poly1305_update);
|
||||
|
||||
void poly1305_core_emit(const struct poly1305_state *state, void *dst)
|
||||
{
|
||||
u32 h0, h1, h2, h3, h4;
|
||||
u32 g0, g1, g2, g3, g4;
|
||||
u32 mask;
|
||||
|
||||
/* fully carry h */
|
||||
h0 = state->h[0];
|
||||
h1 = state->h[1];
|
||||
h2 = state->h[2];
|
||||
h3 = state->h[3];
|
||||
h4 = state->h[4];
|
||||
|
||||
h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
|
||||
h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
|
||||
h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
|
||||
h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
|
||||
h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
|
||||
|
||||
/* compute h + -p */
|
||||
g0 = h0 + 5;
|
||||
g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
|
||||
g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
|
||||
g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
|
||||
g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
|
||||
|
||||
/* select h if h < p, or h + -p if h >= p */
|
||||
mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
|
||||
g0 &= mask;
|
||||
g1 &= mask;
|
||||
g2 &= mask;
|
||||
g3 &= mask;
|
||||
g4 &= mask;
|
||||
mask = ~mask;
|
||||
h0 = (h0 & mask) | g0;
|
||||
h1 = (h1 & mask) | g1;
|
||||
h2 = (h2 & mask) | g2;
|
||||
h3 = (h3 & mask) | g3;
|
||||
h4 = (h4 & mask) | g4;
|
||||
|
||||
/* h = h % (2^128) */
|
||||
put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
|
||||
put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
|
||||
put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
|
||||
put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(poly1305_core_emit);
|
||||
|
||||
int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||
static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
__le32 digest[4];
|
||||
u64 f = 0;
|
||||
|
||||
if (unlikely(!dctx->sset))
|
||||
return -ENOKEY;
|
||||
|
||||
if (unlikely(dctx->buflen)) {
|
||||
dctx->buf[dctx->buflen++] = 1;
|
||||
memset(dctx->buf + dctx->buflen, 0,
|
||||
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||
}
|
||||
|
||||
poly1305_core_emit(&dctx->h, digest);
|
||||
|
||||
/* mac = (h + s) % (2^128) */
|
||||
f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
|
||||
put_unaligned_le32(f, dst + 0);
|
||||
f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
|
||||
put_unaligned_le32(f, dst + 4);
|
||||
f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
|
||||
put_unaligned_le32(f, dst + 8);
|
||||
f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
|
||||
put_unaligned_le32(f, dst + 12);
|
||||
|
||||
poly1305_final_generic(dctx, dst);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_poly1305_final);
|
||||
|
||||
static struct shash_alg poly1305_alg = {
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
|
||||
@@ -2616,6 +2616,30 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
.alg = "authenc(hmac(sha512),rfc3686(ctr(aes)))",
|
||||
.test = alg_test_null,
|
||||
.fips_allowed = 1,
|
||||
}, {
|
||||
.alg = "blake2s-128",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = __VECS(blakes2s_128_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "blake2s-160",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = __VECS(blakes2s_160_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "blake2s-224",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = __VECS(blakes2s_224_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "blake2s-256",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = __VECS(blakes2s_256_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "cbc(aes)",
|
||||
.test = alg_test_skcipher,
|
||||
@@ -2821,6 +2845,12 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
.suite = {
|
||||
.cipher = __VECS(cts_mode_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "curve25519",
|
||||
.test = alg_test_kpp,
|
||||
.suite = {
|
||||
.kpp = __VECS(curve25519_tv_template)
|
||||
}
|
||||
}, {
|
||||
.alg = "deflate",
|
||||
.test = alg_test_comp,
|
||||
|
||||
1669
crypto/testmgr.h
1669
crypto/testmgr.h
File diff suppressed because it is too large
Load Diff
@@ -237,7 +237,7 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
|
||||
struct binder_work {
|
||||
struct list_head entry;
|
||||
|
||||
enum {
|
||||
enum binder_work_type {
|
||||
BINDER_WORK_TRANSACTION = 1,
|
||||
BINDER_WORK_TRANSACTION_COMPLETE,
|
||||
BINDER_WORK_RETURN_ERROR,
|
||||
@@ -897,27 +897,6 @@ static struct binder_work *binder_dequeue_work_head_ilocked(
|
||||
return w;
|
||||
}
|
||||
|
||||
/**
|
||||
* binder_dequeue_work_head() - Dequeues the item at head of list
|
||||
* @proc: binder_proc associated with list
|
||||
* @list: list to dequeue head
|
||||
*
|
||||
* Removes the head of the list if there are items on the list
|
||||
*
|
||||
* Return: pointer dequeued binder_work, NULL if list was empty
|
||||
*/
|
||||
static struct binder_work *binder_dequeue_work_head(
|
||||
struct binder_proc *proc,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct binder_work *w;
|
||||
|
||||
binder_inner_proc_lock(proc);
|
||||
w = binder_dequeue_work_head_ilocked(list);
|
||||
binder_inner_proc_unlock(proc);
|
||||
return w;
|
||||
}
|
||||
|
||||
static void
|
||||
binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer);
|
||||
static void binder_free_thread(struct binder_thread *thread);
|
||||
@@ -4552,13 +4531,17 @@ static void binder_release_work(struct binder_proc *proc,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct binder_work *w;
|
||||
enum binder_work_type wtype;
|
||||
|
||||
while (1) {
|
||||
w = binder_dequeue_work_head(proc, list);
|
||||
binder_inner_proc_lock(proc);
|
||||
w = binder_dequeue_work_head_ilocked(list);
|
||||
wtype = w ? w->type : 0;
|
||||
binder_inner_proc_unlock(proc);
|
||||
if (!w)
|
||||
return;
|
||||
|
||||
switch (w->type) {
|
||||
switch (wtype) {
|
||||
case BINDER_WORK_TRANSACTION: {
|
||||
struct binder_transaction *t;
|
||||
|
||||
@@ -4592,9 +4575,11 @@ static void binder_release_work(struct binder_proc *proc,
|
||||
kfree(death);
|
||||
binder_stats_deleted(BINDER_STAT_DEATH);
|
||||
} break;
|
||||
case BINDER_WORK_NODE:
|
||||
break;
|
||||
default:
|
||||
pr_err("unexpected work type, %d, not freed\n",
|
||||
w->type);
|
||||
wtype);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -481,7 +481,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
|
||||
drv->bus->name, __func__, drv->name, dev_name(dev));
|
||||
if (!list_empty(&dev->devres_head)) {
|
||||
dev_crit(dev, "Resources present before probing\n");
|
||||
return -EBUSY;
|
||||
ret = -EBUSY;
|
||||
goto done;
|
||||
}
|
||||
|
||||
re_probe:
|
||||
@@ -588,7 +589,7 @@ pinctrl_bind_failed:
|
||||
ret = 0;
|
||||
done:
|
||||
atomic_dec(&probe_count);
|
||||
wake_up(&probe_waitqueue);
|
||||
wake_up_all(&probe_waitqueue);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user