Merge tag 'ASB-2020-11-05_4.19-stable' of https://android.googlesource.com/kernel/common

https://source.android.com/security/bulletin/2020-11-01
CVE-2020-0423

* tag 'ASB-2020-11-05_4.19-stable': (529 commits)
  ANDROID: GKI: Enable DEBUG_INFO_DWARF4
  UPSTREAM: mm/sl[uo]b: export __kmalloc_track(_node)_caller
  BACKPORT: xfrm/compat: Translate 32-bit user_policy from sockptr
  BACKPORT: xfrm/compat: Add 32=>64-bit messages translator
  UPSTREAM: xfrm/compat: Attach xfrm dumps to 64=>32 bit translator
  UPSTREAM: xfrm/compat: Add 64=>32-bit messages translator
  BACKPORT: xfrm: Provide API to register translator module
  ANDROID: Publish uncompressed Image on aarch64
  FROMLIST: crypto: arm64/poly1305-neon - reorder PAC authentication with SP update
  UPSTREAM: crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian
  UPSTREAM: crypto: arm64/chacha - fix hchacha_block_neon() for big endian
  Linux 4.19.154
  usb: gadget: f_ncm: allow using NCM in SuperSpeed Plus gadgets.
  eeprom: at25: set minimum read/write access stride to 1
  USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync().
  usb: cdc-acm: add quirk to blacklist ETAS ES58X devices
  tty: serial: fsl_lpuart: fix lpuart32_poll_get_char
  net: korina: cast KSEG0 address to pointer in kfree
  ath10k: check idx validity in __ath10k_htt_rx_ring_fill_n()
  scsi: ufs: ufs-qcom: Fix race conditions caused by ufs_qcom_testbus_config()
  ...

Change-Id: I797efa1149f557c1dfab7856813cc40d1a4d60b2

Conflicts:
	drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
	mm/page_alloc.c
This commit is contained in:
Tao Huang
2020-11-03 18:36:42 +08:00
533 changed files with 48958 additions and 4620 deletions

View File

@@ -562,7 +562,7 @@
loops can be debugged more effectively on production
systems.
clearcpuid=BITNUM [X86]
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit
numbers. Note the Linux specific bits are not necessarily

View File

@@ -99,16 +99,20 @@ Coarse and fast_ns access
Some additional variants exist for more specialized cases:
.. c:function:: ktime_t ktime_get_coarse_boottime( void )
.. c:function:: ktime_t ktime_get_coarse( void )
ktime_t ktime_get_coarse_boottime( void )
ktime_t ktime_get_coarse_real( void )
ktime_t ktime_get_coarse_clocktai( void )
ktime_t ktime_get_coarse_raw( void )
.. c:function:: u64 ktime_get_coarse_ns( void )
u64 ktime_get_coarse_boottime_ns( void )
u64 ktime_get_coarse_real_ns( void )
u64 ktime_get_coarse_clocktai_ns( void )
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
void ktime_get_coarse_real_ts64( struct timespec64 * )
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
void ktime_get_coarse_raw_ts64( struct timespec64 * )
These are quicker than the non-coarse versions, but less accurate,
corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE

View File

@@ -934,12 +934,14 @@ icmp_ratelimit - INTEGER
icmp_msgs_per_sec - INTEGER
Limit maximal number of ICMP packets sent per second from this host.
Only messages whose type matches icmp_ratemask (see below) are
controlled by this limit.
controlled by this limit. For security reasons, the precise count
of messages per second is randomized.
Default: 1000
icmp_msgs_burst - INTEGER
icmp_msgs_per_sec controls number of ICMP packets sent per second,
while icmp_msgs_burst controls the burst size of these packets.
For security reasons, the precise burst size is randomized.
Default: 50
icmp_ratemask - INTEGER

View File

@@ -3901,6 +3901,7 @@ F: crypto/
F: drivers/crypto/
F: include/crypto/
F: include/linux/crypto*
F: lib/crypto/
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
M: Neil Horman <nhorman@tuxdriver.com>
@@ -15882,6 +15883,14 @@ L: linux-gpio@vger.kernel.org
S: Maintained
F: drivers/gpio/gpio-ws16c48.c
WIREGUARD SECURE NETWORK TUNNEL
M: Jason A. Donenfeld <Jason@zx2c4.com>
S: Maintained
F: drivers/net/wireguard/
F: tools/testing/selftests/wireguard/
L: wireguard@lists.zx2c4.com
L: netdev@vger.kernel.org
WISTRON LAPTOP BUTTON DRIVER
M: Miloslav Trmac <mitr@volny.cz>
S: Maintained

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 19
SUBLEVEL = 149
SUBLEVEL = 154
EXTRAVERSION =
NAME = "People's Front"

File diff suppressed because it is too large Load Diff

View File

@@ -2348,6 +2348,7 @@
__sock_recv_ts_and_drops
sock_wake_async
sock_wfree
timer_reduce
unregister_net_sysctl_table
__wake_up_sync_key
__xfrm_policy_check

View File

@@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
select ARC_HAS_ACCL_REGS
select ARC_IRQ_NO_AUTOSAVE
select CLK_HSDK
select RESET_CONTROLLER
select RESET_HSDK
select MIGHT_HAVE_PCI

View File

@@ -120,7 +120,7 @@ ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin -I$(obj)
asflags-y := -DZIMAGE
# Supply kernel BSS size to the decompressor via a linker symbol.
KBSS_SZ = $(shell echo $$(($$($(CROSS_COMPILE)nm $(obj)/../../../../vmlinux | \
KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \
sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \
-e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) )
LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)
@@ -166,7 +166,7 @@ $(obj)/bswapsdi2.S: $(srctree)/arch/$(SRCARCH)/lib/bswapsdi2.S
# The .data section is already discarded by the linker script so no need
# to bother about it here.
check_for_bad_syms = \
bad_syms=$$($(CROSS_COMPILE)nm $@ | sed -n 's/^.\{8\} [bc] \(.*\)/\1/p') && \
bad_syms=$$($(NM) $@ | sed -n 's/^.\{8\} [bc] \(.*\)/\1/p') && \
[ -z "$$bad_syms" ] || \
( echo "following symbols must have non local/private scope:" >&2; \
echo "$$bad_syms" >&2; rm -f $@; false )

View File

@@ -922,8 +922,10 @@
};
rngb: rngb@21b4000 {
compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
reg = <0x021b4000 0x4000>;
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clks IMX6SL_CLK_DUMMY>;
};
weim: weim@21b8000 {

View File

@@ -85,21 +85,21 @@
global_timer: timer@b0020200 {
compatible = "arm,cortex-a9-global-timer";
reg = <0xb0020200 0x100>;
interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};
twd_timer: timer@b0020600 {
compatible = "arm,cortex-a9-twd-timer";
reg = <0xb0020600 0x20>;
interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};
twd_wdt: wdt@b0020620 {
compatible = "arm,cortex-a9-twd-wdt";
reg = <0xb0020620 0xe0>;
interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};

View File

@@ -206,16 +206,16 @@
};
&reg_dc1sw {
regulator-min-microvolt = <3000000>;
regulator-max-microvolt = <3000000>;
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
regulator-name = "vcc-gmac-phy";
};
&reg_dcdc1 {
regulator-always-on;
regulator-min-microvolt = <3000000>;
regulator-max-microvolt = <3000000>;
regulator-name = "vcc-3v0";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
regulator-name = "vcc-3v3";
};
&reg_dcdc2 {

View File

@@ -1,3 +1,4 @@
aesbs-core.S
sha256-core.S
sha512-core.S
poly1305-core.S

View File

@@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha stream cipher algorithms"
depends on KERNEL_MODE_NEON
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_POLY1305_ARM
tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
select CRYPTO_HASH
select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
config CRYPTO_CURVE25519_NEON
tristate "NEON accelerated Curve25519 scalar multiplication library"
depends on KERNEL_MODE_NEON
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
endif

View File

@@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
chacha-neon-y := chacha-scalar-core.o chacha-glue.o
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
poly1305-arm-y := poly1305-core.o poly1305-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
$(call cmd,perl)
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
$(call cmd,perl)
@@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
$(call cmd,perl)
endif
targets += sha256-core.S sha512-core.S
targets += poly1305-core.S sha256-core.S sha512-core.S
# massage the perlasm code a bit so we only get the NEON routine if we need it
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
AFLAGS_poly1305-core.o += $(poly1305-aflags-y)

View File

@@ -0,0 +1,356 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2015 Martin Willi
*/
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
const u32 *state, int nrounds);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
static inline bool neon_usable(void)
{
return static_branch_likely(&use_neon) && may_use_simd();
}
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha_block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, bytes);
}
}
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
hchacha_block_arm(state, stream, nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, stream, nrounds);
kernel_neon_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
bytes <= CHACHA_BLOCK_SIZE) {
chacha_doarm(dst, src, bytes, state, nrounds);
state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
return;
}
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_neon_begin();
chacha_doneon(state, dst, src, todo, nrounds);
kernel_neon_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv,
bool neon)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
nbytes, state, ctx->nrounds);
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
} else {
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes, ctx->nrounds);
kernel_neon_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int do_chacha(struct skcipher_request *req, bool neon)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_stream_xor(req, ctx, req->iv, neon);
}
static int chacha_arm(struct skcipher_request *req)
{
return do_chacha(req, false);
}
static int chacha_neon(struct skcipher_request *req)
{
return do_chacha(req, neon_usable());
}
static int do_xchacha(struct skcipher_request *req, bool neon)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
hchacha_block_arm(state, subctx.key, ctx->nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
}
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_stream_xor(req, &subctx, real_iv, neon);
}
static int xchacha_arm(struct skcipher_request *req)
{
return do_xchacha(req, false);
}
static int xchacha_neon(struct skcipher_request *req)
{
return do_xchacha(req, neon_usable());
}
static struct skcipher_alg arm_algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_arm,
.decrypt = chacha_arm,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_arm,
.decrypt = xchacha_arm,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_arm,
.decrypt = xchacha_arm,
},
};
static struct skcipher_alg neon_algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
};
static int __init chacha_simd_mod_init(void)
{
int err = 0;
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
if (err)
return err;
}
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
int i;
switch (read_cpuid_part()) {
case ARM_CPU_PART_CORTEX_A7:
case ARM_CPU_PART_CORTEX_A5:
/*
* The Cortex-A7 and Cortex-A5 do not perform well with
* the NEON implementation but do incredibly with the
* scalar one and use less power.
*/
for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
neon_algs[i].base.cra_priority = 0;
break;
default:
static_branch_enable(&use_neon);
}
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
if (err)
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
}
}
return err;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
}
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-arm");
#ifdef CONFIG_KERNEL_MODE_NEON
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha12-neon");
#endif

View File

@@ -0,0 +1,460 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2018 Google, Inc.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
/*
* Design notes:
*
* 16 registers would be needed to hold the state matrix, but only 14 are
* available because 'sp' and 'pc' cannot be used. So we spill the elements
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
* 'ldrd' and one 'strd' instruction per round.
*
* All rotates are performed using the implicit rotate operand accepted by the
* 'add' and 'eor' instructions. This is faster than using explicit rotate
* instructions. To make this work, we allow the values in the second and last
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
* wrong rotation amount. The rotation amount is then fixed up just in time
* when the values are used. 'brot' is the number of bits the values in row 'b'
* need to be rotated right to arrive at the correct values, and 'drot'
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
* that they end up as (25, 24) after every round.
*/
// ChaCha state registers
X0 .req r0
X1 .req r1
X2 .req r2
X3 .req r3
X4 .req r4
X5 .req r5
X6 .req r6
X7 .req r7
X8_X10 .req r8 // shared by x8 and x10
X9_X11 .req r9 // shared by x9 and x11
X12 .req r10
X13 .req r11
X14 .req r12
X15 .req r14
.macro __rev out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
rev \out, \in
.else
lsl \t0, \in, #24
and \t1, \in, #0xff00
and \t2, \in, #0xff0000
orr \out, \t0, \in, lsr #24
orr \out, \out, \t1, lsl #8
orr \out, \out, \t2, lsr #8
.endif
.endm
.macro _le32_bswap x, t0, t1, t2
#ifdef __ARMEB__
__rev \x, \x, \t0, \t1, \t2
#endif
.endm
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
_le32_bswap \a, \t0, \t1, \t2
_le32_bswap \b, \t0, \t1, \t2
_le32_bswap \c, \t0, \t1, \t2
_le32_bswap \d, \t0, \t1, \t2
.endm
.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
ldrd \a, \b, [\src, #\offset]
#else
ldr \a, [\src, #\offset]
ldr \b, [\src, #\offset + 4]
#endif
.endm
.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
strd \a, \b, [\dst, #\offset]
#else
str \a, [\dst, #\offset]
str \b, [\dst, #\offset + 4]
#endif
.endm
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
// a += b; d ^= a; d = rol(d, 16);
add \a1, \a1, \b1, ror #brot
add \a2, \a2, \b2, ror #brot
eor \d1, \a1, \d1, ror #drot
eor \d2, \a2, \d2, ror #drot
// drot == 32 - 16 == 16
// c += d; b ^= c; b = rol(b, 12);
add \c1, \c1, \d1, ror #16
add \c2, \c2, \d2, ror #16
eor \b1, \c1, \b1, ror #brot
eor \b2, \c2, \b2, ror #brot
// brot == 32 - 12 == 20
// a += b; d ^= a; d = rol(d, 8);
add \a1, \a1, \b1, ror #20
add \a2, \a2, \b2, ror #20
eor \d1, \a1, \d1, ror #16
eor \d2, \a2, \d2, ror #16
// drot == 32 - 8 == 24
// c += d; b ^= c; b = rol(b, 7);
add \c1, \c1, \d1, ror #24
add \c2, \c2, \d2, ror #24
eor \b1, \c1, \b1, ror #20
eor \b2, \c2, \b2, ror #20
// brot == 32 - 7 == 25
.endm
.macro _doubleround
// column round
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
// save (x8, x9); restore (x10, x11)
__strd X8_X10, X9_X11, sp, 0
__ldrd X8_X10, X9_X11, sp, 8
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
.set brot, 25
.set drot, 24
// diagonal round
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
// save (x10, x11); restore (x8, x9)
__strd X8_X10, X9_X11, sp, 8
__ldrd X8_X10, X9_X11, sp, 0
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
.endm
.macro _chacha_permute nrounds
.set brot, 0
.set drot, 0
.rept \nrounds / 2
_doubleround
.endr
.endm
.macro _chacha nrounds
.Lnext_block\@:
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// Do the core ChaCha permutation to update x0-x15.
_chacha_permute \nrounds
add sp, #8
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
push {X8_X10, X9_X11, X12, X13, X14, X15}
// Load (OUT, IN, LEN).
ldr r14, [sp, #96]
ldr r12, [sp, #100]
ldr r11, [sp, #104]
orr r10, r14, r12
// Use slow path if fewer than 64 bytes remain.
cmp r11, #64
blt .Lxor_slowpath\@
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
// ARMv6+, since ldmia and stmia (used below) still require alignment.
tst r10, #3
bne .Lxor_slowpath\@
// Fast path: XOR 64 bytes of aligned data.
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// x0-x3
__ldrd r8, r9, sp, 32
__ldrd r10, r11, sp, 40
add X0, X0, r8
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
ldmia r12!, {r8-r11}
eor X0, X0, r8
eor X1, X1, r9
eor X2, X2, r10
eor X3, X3, r11
stmia r14!, {X0-X3}
// x4-x7
__ldrd r8, r9, sp, 48
__ldrd r10, r11, sp, 56
add X4, r8, X4, ror #brot
add X5, r9, X5, ror #brot
ldmia r12!, {X0-X3}
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
eor X4, X4, X0
eor X5, X5, X1
eor X6, X6, X2
eor X7, X7, X3
stmia r14!, {X4-X7}
// x8-x15
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
__ldrd r8, r9, sp, 32
__ldrd r10, r11, sp, 40
add r0, r0, r8 // x8
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
ldmia r12!, {r8-r11}
eor r0, r0, r8 // x8
eor r1, r1, r9 // x9
eor r6, r6, r10 // x10
eor r7, r7, r11 // x11
stmia r14!, {r0,r1,r6,r7}
ldmia r12!, {r0,r1,r6,r7}
__ldrd r8, r9, sp, 48
__ldrd r10, r11, sp, 56
add r2, r8, r2, ror #drot // x12
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
ldr r9, [sp, #72] // load LEN
eor r2, r2, r0 // x12
eor r3, r3, r1 // x13
eor r4, r4, r6 // x14
eor r5, r5, r7 // x15
subs r9, #64 // decrement and check LEN
stmia r14!, {r2-r5}
beq .Ldone\@
.Lprepare_for_next_block\@:
// Stack: x0-x15 OUT IN LEN
// Increment block counter (x12)
add r8, #1
// Store updated (OUT, IN, LEN)
str r14, [sp, #64]
str r12, [sp, #68]
str r9, [sp, #72]
mov r14, sp
// Store updated block counter (x12)
str r8, [sp, #48]
sub sp, #16
// Reload state and do next block
ldmia r14!, {r0-r11} // load x0-x11
__strd r10, r11, sp, 8 // store x10-x11 before state
ldmia r14, {r10-r12,r14} // load x12-x15
b .Lnext_block\@
.Lxor_slowpath\@:
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
// We handle it by storing the 64 bytes of keystream to the stack, then
// XOR-ing the needed portion with the data.
// Allocate keystream buffer
sub sp, #64
mov r14, sp
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Save keystream for x0-x3
__ldrd r8, r9, sp, 96
__ldrd r10, r11, sp, 104
add X0, X0, r8
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
stmia r14!, {X0-X3}
// Save keystream for x4-x7
__ldrd r8, r9, sp, 112
__ldrd r10, r11, sp, 120
add X4, r8, X4, ror #brot
add X5, r9, X5, ror #brot
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
add r8, sp, #64
stmia r14!, {X4-X7}
// Save keystream for x8-x15
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
__ldrd r8, r9, sp, 128
__ldrd r10, r11, sp, 136
add r0, r0, r8 // x8
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
stmia r14!, {r0,r1,r6,r7}
__ldrd r8, r9, sp, 144
__ldrd r10, r11, sp, 152
add r2, r8, r2, ror #drot // x12
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
stmia r14, {r2-r5}
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
// Registers: r8 is block counter, r12 is IN.
ldr r9, [sp, #168] // LEN
ldr r14, [sp, #160] // OUT
cmp r9, #64
mov r0, sp
movle r1, r9
movgt r1, #64
// r1 is number of bytes to XOR, in range [1, 64]
.if __LINUX_ARM_ARCH__ < 6
orr r2, r12, r14
tst r2, #3 // IN or OUT misaligned?
bne .Lxor_next_byte\@
.endif
// XOR a word at a time
.rept 16
subs r1, #4
blt .Lxor_words_done\@
ldr r2, [r12], #4
ldr r3, [r0], #4
eor r2, r2, r3
str r2, [r14], #4
.endr
b .Lxor_slowpath_done\@
.Lxor_words_done\@:
ands r1, r1, #3
beq .Lxor_slowpath_done\@
// XOR a byte at a time
.Lxor_next_byte\@:
ldrb r2, [r12], #1
ldrb r3, [r0], #1
eor r2, r2, r3
strb r2, [r14], #1
subs r1, #1
bne .Lxor_next_byte\@
.Lxor_slowpath_done\@:
subs r9, #64
add sp, #96
bgt .Lprepare_for_next_block\@
.Ldone\@:
.endm // _chacha
/*
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
* const u32 *state, int nrounds);
*/
ENTRY(chacha_doarm)
cmp r2, #0 // len == 0?
reteq lr
ldr ip, [sp]
cmp ip, #12
push {r0-r2,r4-r11,lr}
// Push state x0-x15 onto stack.
// Also store an extra copy of x10-x11 just before the state.
add X12, r3, #48
ldm X12, {X12,X13,X14,X15}
push {X12,X13,X14,X15}
sub sp, sp, #64
__ldrd X8_X10, X9_X11, r3, 40
__strd X8_X10, X9_X11, sp, 8
__strd X8_X10, X9_X11, sp, 56
ldm r3, {X0-X9_X11}
__strd X0, X1, sp, 16
__strd X2, X3, sp, 24
__strd X4, X5, sp, 32
__strd X6, X7, sp, 40
__strd X8_X10, X9_X11, sp, 48
beq 1f
_chacha 20
0: add sp, #76
pop {r4-r11, pc}
1: _chacha 12
b 0b
ENDPROC(chacha_doarm)
/*
* void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
*/
ENTRY(hchacha_block_arm)
push {r1,r4-r11,lr}
cmp r2, #12 // ChaCha12 ?
mov r14, r0
ldmia r14!, {r0-r11} // load x0-x11
push {r10-r11} // store x10-x11 to stack
ldm r14, {r10-r12,r14} // load x12-x15
sub sp, #8
beq 1f
_chacha_permute 20
// Skip over (unused0-unused1, x10-x11)
0: add sp, #16
// Fix up rotations of x12-x15
ror X12, X12, #drot
ror X13, X13, #drot
pop {r4} // load 'out'
ror X14, X14, #drot
ror X15, X15, #drot
// Store (x0-x3,x12-x15) to 'out'
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
pop {r4-r11,pc}
1: _chacha_permute 12
b 0b
ENDPROC(hchacha_block_arm)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
* manually reworked for use in kernel space.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/kpp.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/scatterlist.h>
#include <crypto/curve25519.h>
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE]);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
const u8 scalar[CURVE25519_KEY_SIZE],
const u8 point[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&have_neon) && may_use_simd()) {
kernel_neon_begin();
curve25519_neon(out, scalar, point);
kernel_neon_end();
} else {
curve25519_generic(out, scalar, point);
}
}
EXPORT_SYMBOL(curve25519_arch);
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE])
{
return curve25519_arch(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{
u8 *secret = kpp_tfm_ctx(tfm);
if (!len)
curve25519_generate_secret(secret);
else if (len == CURVE25519_KEY_SIZE &&
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
memcpy(secret, buf, CURVE25519_KEY_SIZE);
else
return -EINVAL;
return 0;
}
static int curve25519_compute_value(struct kpp_request *req)
{
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
const u8 *secret = kpp_tfm_ctx(tfm);
u8 public_key[CURVE25519_KEY_SIZE];
u8 buf[CURVE25519_KEY_SIZE];
int copied, nbytes;
u8 const *bp;
if (req->src) {
copied = sg_copy_to_buffer(req->src,
sg_nents_for_len(req->src,
CURVE25519_KEY_SIZE),
public_key, CURVE25519_KEY_SIZE);
if (copied != CURVE25519_KEY_SIZE)
return -EINVAL;
bp = public_key;
} else {
bp = curve25519_base_point;
}
curve25519_arch(buf, secret, bp);
/* might want less than we've got */
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
nbytes),
buf, nbytes);
if (copied != nbytes)
return -EINVAL;
return 0;
}
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
return CURVE25519_KEY_SIZE;
}
static struct kpp_alg curve25519_alg = {
.base.cra_name = "curve25519",
.base.cra_driver_name = "curve25519-neon",
.base.cra_priority = 200,
.base.cra_module = THIS_MODULE,
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
.set_secret = curve25519_set_secret,
.generate_public_key = curve25519_compute_value,
.compute_shared_secret = curve25519_compute_value,
.max_size = curve25519_max_size,
};
static int __init mod_init(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
crypto_register_kpp(&curve25519_alg) : 0;
}
return 0;
}
static void __exit mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
crypto_unregister_kpp(&curve25519_alg);
}
module_init(mod_init);
module_exit(mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
MODULE_LICENSE("GPL v2");

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,272 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
{
}
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_arm(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int arm_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit, bool do_neon)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_arm(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
if (static_branch_likely(&have_neon) && likely(do_neon))
poly1305_blocks_neon(&dctx->h, src, len, hibit);
else
poly1305_blocks_arm(&dctx->h, src, len, hibit);
}
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
const u8 *src, u32 len, bool do_neon)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
arm_poly1305_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE, 1, false);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
}
static int arm_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
arm_poly1305_do_update(dctx, src, srclen, false);
return 0;
}
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
const u8 *src,
unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
bool do_neon = may_use_simd() && srclen > 128;
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_begin();
arm_poly1305_do_update(dctx, src, srclen, do_neon);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_end();
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
may_use_simd();
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks_arm(&dctx->h, dctx->buf,
POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && do_neon) {
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
kernel_neon_begin();
poly1305_blocks_neon(&dctx->h, src, todo, 1);
kernel_neon_end();
len -= todo;
src += todo;
} while (len);
} else {
poly1305_blocks_arm(&dctx->h, src, len, 1);
src += len;
}
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit_arm(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg arm_poly1305_algs[] = {{
.init = arm_poly1305_init,
.update = arm_poly1305_update,
.final = arm_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-arm",
.base.cra_priority = 150,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
#ifdef CONFIG_KERNEL_MODE_NEON
}, {
.init = arm_poly1305_init,
.update = arm_poly1305_update_neon,
.final = arm_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-neon",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
#endif
}};
static int __init arm_poly1305_mod_init(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
(elf_hwcap & HWCAP_NEON))
static_branch_enable(&have_neon);
else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
/* register only the first entry */
return crypto_register_shash(&arm_poly1305_algs[0]);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shashes(arm_poly1305_algs,
ARRAY_SIZE(arm_poly1305_algs)) : 0;
}
static void __exit arm_poly1305_mod_exit(void)
{
if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
return;
if (!static_branch_likely(&have_neon)) {
crypto_unregister_shash(&arm_poly1305_algs[0]);
return;
}
crypto_unregister_shashes(arm_poly1305_algs,
ARRAY_SIZE(arm_poly1305_algs));
}
module_init(arm_poly1305_mod_init);
module_exit(arm_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-arm");
MODULE_ALIAS_CRYPTO("poly1305-neon");

View File

@@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,
ret = of_property_read_u32(np, "prefetch-data", &val);
if (ret == 0) {
if (val)
if (val) {
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
else
*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
} else {
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
}
*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
} else if (ret != -EINVAL) {
pr_err("L2C-310 OF prefetch-data property value is missing\n");
}
ret = of_property_read_u32(np, "prefetch-instr", &val);
if (ret == 0) {
if (val)
if (val) {
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
else
*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
} else {
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
}
*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
} else if (ret != -EINVAL) {
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
}

View File

@@ -10,12 +10,13 @@ obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector
ccflags-y += -DDISABLE_BRANCH_PROFILING
VDSO_LDFLAGS := -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
VDSO_LDFLAGS += -nostdlib -shared
VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id)
VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd)
ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8
ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \
-z max-page-size=4096 -z common-page-size=4096 \
-nostdlib -shared $(ldflags-y) \
$(call ld-option, --hash-style=sysv) \
$(call ld-option, --build-id) \
-T
obj-$(CONFIG_VDSO) += vdso.o
extra-$(CONFIG_VDSO) += vdso.lds
@@ -37,8 +38,8 @@ KCOV_INSTRUMENT := n
$(obj)/vdso.o : $(obj)/vdso.so
# Link rule for the .so file
$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE
$(call if_changed,vdsold)
$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE
$(call if_changed,ld)
$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/vdsomunge FORCE
$(call if_changed,vdsomunge)
@@ -48,11 +49,6 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
# Actual build commands
quiet_cmd_vdsold = VDSO $@
cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \
-Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
quiet_cmd_vdsomunge = MUNGE $@
cmd_vdsomunge = $(objtree)/$(obj)/vdsomunge $< $@

View File

@@ -10,7 +10,7 @@
#
# Copyright (C) 1995-2001 by Russell King
LDFLAGS_vmlinux :=--no-undefined -X
LDFLAGS_vmlinux :=--no-undefined -X -z norelro
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
GZFLAGS :=-9
@@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
# for relative relocs, since this leads to better Image compression
# with the relocation offsets always being zero.
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
$(call ld-option, --no-apply-dynamic-relocs)
endif

View File

@@ -151,6 +151,7 @@
};
&qspi {
status = "okay";
flash@0 {
#address-cells = <1>;
#size-cells = <1>;

View File

@@ -877,7 +877,7 @@
reg-names = "mdp_phys";
interrupt-parent = <&mdss>;
interrupts = <0 0>;
interrupts = <0>;
clocks = <&gcc GCC_MDSS_AHB_CLK>,
<&gcc GCC_MDSS_AXI_CLK>,
@@ -909,7 +909,7 @@
reg-names = "dsi_ctrl";
interrupt-parent = <&mdss>;
interrupts = <4 0>;
interrupts = <4>;
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
<&gcc PCLK0_CLK_SRC>;

View File

@@ -99,7 +99,7 @@
wcd_codec: codec@f000 {
compatible = "qcom,pm8916-wcd-analog-codec";
reg = <0xf000 0x200>;
reg = <0xf000>;
reg-names = "pmic-codec-core";
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
clock-names = "mclk";

View File

@@ -411,7 +411,7 @@
};
i2c0: i2c@ff020000 {
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
compatible = "cdns,i2c-r1p14";
status = "disabled";
interrupt-parent = <&gic>;
interrupts = <0 17 4>;
@@ -421,7 +421,7 @@
};
i2c1: i2c@ff030000 {
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
compatible = "cdns,i2c-r1p14";
status = "disabled";
interrupt-parent = <&gic>;
interrupts = <0 18 4>;

View File

@@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
CONFIG_ARM_SCPI_PROTOCOL=y
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_JUMP_LABEL=y
@@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set
@@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_OTG=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y
@@ -502,6 +503,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y

View File

@@ -1,2 +1,3 @@
sha256-core.S
sha512-core.S
poly1305-core.S

View File

@@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_SIMD
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_POLY1305_NEON
tristate "Poly1305 hash function using scalar or NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"

View File

@@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
poly1305-neon-y := poly1305-core.o poly1305-glue.o
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
@@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
$(call cmd,perlasm)
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
@@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
endif
targets += sha256-core.S sha512-core.S
targets += poly1305-core.S sha256-core.S sha512-core.S

View File

@@ -1,13 +1,13 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
* ChaCha/XChaCha NEON helper functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* Originally based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
@@ -19,29 +19,27 @@
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
.align 6
ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers v0-v3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*
* The round count is given in w3.
*
* Clobbers: w3, x10, v4, v12
*/
chacha_permute:
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// x0..3 = s0..3
adr x3, ROT8
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
ld1 {v12.4s}, [x3]
mov x3, #10
adr_l x10, ROT8
ld1 {v12.4s}, [x10]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
subs x3, x3, #1
subs w3, w3, #2
b.ne .Ldoubleround
ret
ENDPROC(chacha_permute)
ENTRY(chacha_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
// w3: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
bl chacha_permute
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
@@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
st1 {v0.16b-v3.16b}, [x1]
ldp x29, x30, [sp], #16
ret
ENDPROC(chacha20_block_xor_neon)
ENDPROC(chacha_block_xor_neon)
ENTRY(hchacha_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)
// w2: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.4s-v3.4s}, [x0]
mov w3, w2
bl chacha_permute
st1 {v0.4s}, [x1], #16
st1 {v3.4s}, [x1]
ldp x29, x30, [sp], #16
ret
ENDPROC(hchacha_block_neon)
a0 .req w12
a1 .req w13
a2 .req w14
a3 .req w15
a4 .req w16
a5 .req w17
a6 .req w19
a7 .req w20
a8 .req w21
a9 .req w22
a10 .req w23
a11 .req w24
a12 .req w25
a13 .req w26
a14 .req w27
a15 .req w28
.align 6
ENTRY(chacha20_4block_xor_neon)
ENTRY(chacha_4block_xor_neon)
frame_push 10
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
// x4: byte count
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
add x11, x10, #64
//
// This function encrypts four consecutive ChaCha20 blocks by loading
// This function encrypts four consecutive ChaCha blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
adr x3, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x3]
// At the same time, a fifth block is encrypted in parallel using
// scalar registers
//
adr_l x9, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
// x0..15[0-3] = s0..3[0..3]
mov x4, x0
ld4r { v0.4s- v3.4s}, [x4], #16
ld4r { v4.4s- v7.4s}, [x4], #16
ld4r { v8.4s-v11.4s}, [x4], #16
ld4r {v12.4s-v15.4s}, [x4]
add x8, x0, #16
ld4r { v0.4s- v3.4s}, [x0]
ld4r { v4.4s- v7.4s}, [x8], #16
ld4r { v8.4s-v11.4s}, [x8], #16
ld4r {v12.4s-v15.4s}, [x8]
// x12 += counter values 0-3
mov a0, v0.s[0]
mov a1, v1.s[0]
mov a2, v2.s[0]
mov a3, v3.s[0]
mov a4, v4.s[0]
mov a5, v5.s[0]
mov a6, v6.s[0]
mov a7, v7.s[0]
mov a8, v8.s[0]
mov a9, v9.s[0]
mov a10, v10.s[0]
mov a11, v11.s[0]
mov a12, v12.s[0]
mov a13, v13.s[0]
mov a14, v14.s[0]
mov a15, v15.s[0]
// x12 += counter values 1-4
add v12.4s, v12.4s, v30.4s
mov x3, #10
.Ldoubleround4:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
rev32 v15.8h, v15.8h
ror a15, a15, #16
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #12
shl v5.4s, v17.4s, #12
@@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #12
sri v4.4s, v16.4s, #20
ror a4, a4, #20
sri v5.4s, v17.4s, #20
ror a5, a5, #20
sri v6.4s, v18.4s, #20
ror a6, a6, #20
sri v7.4s, v19.4s, #20
ror a7, a7, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #7
shl v5.4s, v17.4s, #7
@@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #7
sri v4.4s, v16.4s, #25
ror a4, a4, #25
sri v5.4s, v17.4s, #25
ror a5, a5, #25
sri v6.4s, v18.4s, #25
ror a6, a6, #25
sri v7.4s, v19.4s, #25
ror a7, a7, #25
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
rev32 v15.8h, v15.8h
ror a15, a15, #16
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #12
shl v6.4s, v17.4s, #12
@@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #12
sri v5.4s, v16.4s, #20
ror a5, a5, #20
sri v6.4s, v17.4s, #20
ror a6, a6, #20
sri v7.4s, v18.4s, #20
ror a7, a7, #20
sri v4.4s, v19.4s, #20
ror a4, a4, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #7
shl v6.4s, v17.4s, #7
@@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #7
sri v5.4s, v16.4s, #25
ror a5, a5, #25
sri v6.4s, v17.4s, #25
ror a6, a6, #25
sri v7.4s, v18.4s, #25
ror a7, a7, #25
sri v4.4s, v19.4s, #25
ror a4, a4, #25
subs x3, x3, #1
subs w3, w3, #2
b.ne .Ldoubleround4
ld4r {v16.4s-v19.4s}, [x0], #16
@@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
add v0.4s, v0.4s, v16.4s
mov w6, v16.s[0]
mov w7, v17.s[0]
add v1.4s, v1.4s, v17.4s
mov w8, v18.s[0]
mov w9, v19.s[0]
add v2.4s, v2.4s, v18.4s
add a0, a0, w6
add a1, a1, w7
add v3.4s, v3.4s, v19.4s
add a2, a2, w8
add a3, a3, w9
CPU_BE( rev a0, a0 )
CPU_BE( rev a1, a1 )
CPU_BE( rev a2, a2 )
CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
add v4.4s, v4.4s, v20.4s
mov w6, v20.s[0]
mov w7, v21.s[0]
add v5.4s, v5.4s, v21.4s
mov w8, v22.s[0]
mov w9, v23.s[0]
add v6.4s, v6.4s, v22.4s
add a4, a4, w6
add a5, a5, w7
add v7.4s, v7.4s, v23.4s
add a6, a6, w8
add a7, a7, w9
CPU_BE( rev a4, a4 )
CPU_BE( rev a5, a5 )
CPU_BE( rev a6, a6 )
CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
add v8.4s, v8.4s, v24.4s
mov w6, v24.s[0]
mov w7, v25.s[0]
add v9.4s, v9.4s, v25.4s
mov w8, v26.s[0]
mov w9, v27.s[0]
add v10.4s, v10.4s, v26.4s
add a8, a8, w6
add a9, a9, w7
add v11.4s, v11.4s, v27.4s
add a10, a10, w8
add a11, a11, w9
CPU_BE( rev a8, a8 )
CPU_BE( rev a9, a9 )
CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
add v12.4s, v12.4s, v28.4s
mov w6, v28.s[0]
mov w7, v29.s[0]
add v13.4s, v13.4s, v29.4s
mov w8, v30.s[0]
mov w9, v31.s[0]
add v14.4s, v14.4s, v30.4s
add a12, a12, w6
add a13, a13, w7
add v15.4s, v15.4s, v31.4s
add a14, a14, w8
add a15, a15, w9
CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64
zip1 v16.4s, v0.4s, v1.4s
ldp w8, w9, [x2, #-56]
eor a0, a0, w6
zip2 v17.4s, v0.4s, v1.4s
eor a1, a1, w7
zip1 v18.4s, v2.4s, v3.4s
eor a2, a2, w8
zip2 v19.4s, v2.4s, v3.4s
eor a3, a3, w9
ldp w6, w7, [x2, #-48]
zip1 v20.4s, v4.4s, v5.4s
ldp w8, w9, [x2, #-40]
eor a4, a4, w6
zip2 v21.4s, v4.4s, v5.4s
eor a5, a5, w7
zip1 v22.4s, v6.4s, v7.4s
eor a6, a6, w8
zip2 v23.4s, v6.4s, v7.4s
eor a7, a7, w9
ldp w6, w7, [x2, #-32]
zip1 v24.4s, v8.4s, v9.4s
ldp w8, w9, [x2, #-24]
eor a8, a8, w6
zip2 v25.4s, v8.4s, v9.4s
eor a9, a9, w7
zip1 v26.4s, v10.4s, v11.4s
eor a10, a10, w8
zip2 v27.4s, v10.4s, v11.4s
eor a11, a11, w9
ldp w6, w7, [x2, #-16]
zip1 v28.4s, v12.4s, v13.4s
ldp w8, w9, [x2, #-8]
eor a12, a12, w6
zip2 v29.4s, v12.4s, v13.4s
eor a13, a13, w7
zip1 v30.4s, v14.4s, v15.4s
eor a14, a14, w8
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
mov x3, #64
subs x5, x4, #128
add x6, x5, x2
csel x3, x3, xzr, ge
csel x2, x2, x6, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
stp a0, a1, [x1], #64
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
ld1 {v16.16b-v19.16b}, [x2], #64
stp a2, a3, [x1, #-56]
ld1 {v16.16b-v19.16b}, [x2], x3
subs x6, x4, #192
ccmp x3, xzr, #4, lt
add x7, x6, x2
csel x3, x3, xzr, eq
csel x2, x2, x7, eq
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
stp a4, a5, [x1, #-48]
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
ld1 {v20.16b-v23.16b}, [x2], #64
stp a6, a7, [x1, #-40]
ld1 {v20.16b-v23.16b}, [x2], x3
subs x7, x4, #256
ccmp x3, xzr, #4, lt
add x8, x7, x2
csel x3, x3, xzr, eq
csel x2, x2, x8, eq
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
stp a8, a9, [x1, #-32]
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
ld1 {v24.16b-v27.16b}, [x2], #64
stp a10, a11, [x1, #-24]
ld1 {v24.16b-v27.16b}, [x2], x3
subs x8, x4, #320
ccmp x3, xzr, #4, lt
add x9, x8, x2
csel x2, x2, x9, eq
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
stp a12, a13, [x1, #-16]
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
st1 {v16.16b-v19.16b}, [x1], #64
cbz x5, .Lout
tbnz x6, #63, 1f
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v20.16b-v23.16b}, [x1], #64
cbz x6, .Lout
tbnz x7, #63, 2f
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
eor v28.16b, v28.16b, v12.16b
st1 {v24.16b-v27.16b}, [x1], #64
cbz x7, .Lout
tbnz x8, #63, 3f
eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
ENDPROC(chacha20_4block_xor_neon)
CTRINC: .word 0, 1, 2, 3
// fewer than 128 bytes of in/output
0: ld1 {v8.16b}, [x10]
ld1 {v9.16b}, [x11]
movi v10.16b, #16
sub x2, x1, #64
add x1, x1, x5
ld1 {v16.16b-v19.16b}, [x2]
tbl v4.16b, {v0.16b-v3.16b}, v8.16b
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v5.16b, {v0.16b-v3.16b}, v8.16b
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v6.16b, {v0.16b-v3.16b}, v8.16b
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v7.16b, {v0.16b-v3.16b}, v8.16b
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 192 bytes of in/output
1: ld1 {v8.16b}, [x10]
ld1 {v9.16b}, [x11]
movi v10.16b, #16
add x1, x1, x6
tbl v0.16b, {v4.16b-v7.16b}, v8.16b
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v1.16b, {v4.16b-v7.16b}, v8.16b
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v2.16b, {v4.16b-v7.16b}, v8.16b
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v3.16b, {v4.16b-v7.16b}, v8.16b
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
eor v20.16b, v20.16b, v0.16b
eor v21.16b, v21.16b, v1.16b
eor v22.16b, v22.16b, v2.16b
eor v23.16b, v23.16b, v3.16b
st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 256 bytes of in/output
2: ld1 {v4.16b}, [x10]
ld1 {v5.16b}, [x11]
movi v6.16b, #16
add x1, x1, x7
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
tbx v24.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v1.16b, {v8.16b-v11.16b}, v4.16b
tbx v25.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v2.16b, {v8.16b-v11.16b}, v4.16b
tbx v26.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v3.16b, {v8.16b-v11.16b}, v4.16b
tbx v27.16b, {v20.16b-v23.16b}, v5.16b
eor v24.16b, v24.16b, v0.16b
eor v25.16b, v25.16b, v1.16b
eor v26.16b, v26.16b, v2.16b
eor v27.16b, v27.16b, v3.16b
st1 {v24.16b-v27.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
3: ld1 {v4.16b}, [x10]
ld1 {v5.16b}, [x11]
movi v6.16b, #16
add x1, x1, x8
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
tbx v28.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v1.16b, {v12.16b-v15.16b}, v4.16b
tbx v29.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v2.16b, {v12.16b-v15.16b}, v4.16b
tbx v30.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v3.16b, {v12.16b-v15.16b}, v4.16b
tbx v31.16b, {v24.16b-v27.16b}, v5.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x1]
b .Lout
ENDPROC(chacha_4block_xor_neon)
.section ".rodata", "a", %progbits
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
.rept 192
.byte (.Li - 64)
.set .Li, .Li + 1
.endr
CTRINC: .word 1, 2, 3, 4
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f

View File

@@ -1,8 +1,8 @@
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,9 @@
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -29,40 +30,78 @@
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha_block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, bytes);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
while (bytes > 0) {
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
if (l <= CHACHA_BLOCK_SIZE) {
u8 buf[CHACHA_BLOCK_SIZE];
memcpy(buf, src, l);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, l);
state[12] += 1;
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
bytes -= l;
src += l;
dst += l;
state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
}
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!static_branch_likely(&have_neon) || !may_use_simd()) {
hchacha_block_generic(state, stream, nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, stream, nrounds);
kernel_neon_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
!may_use_simd())
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_neon_begin();
chacha_doneon(state, dst, src, todo, nrounds);
kernel_neon_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_neon_stream_xor(struct skcipher_request *req,
struct chacha_ctx *ctx, u8 *iv)
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
@@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, iv);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
nbytes = rounddown(nbytes, walk.stride);
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
kernel_neon_end();
if (!static_branch_likely(&have_neon) ||
!may_use_simd()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
} else {
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes, ctx->nrounds);
kernel_neon_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
u32 state[16];
u8 real_iv[16];
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_xchacha_crypt(req);
crypto_chacha_init(state, ctx, req->iv);
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
chacha_init_generic(state, ctx->key, req->iv);
hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
@@ -176,15 +213,19 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
if (!(elf_hwcap & HWCAP_ASIMD))
return 0;
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);

View File

@@ -1,133 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
kernel_neon_begin();
chacha20_4block_xor_neon(state, dst, src);
kernel_neon_end();
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
if (!bytes)
return;
kernel_neon_begin();
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_neon(state, buf, buf);
memcpy(dst, buf, bytes);
}
kernel_neon_end();
}
static int chacha20_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
u32 state[16];
int err;
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_neon,
.decrypt = chacha20_neon,
};
static int __init chacha20_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");

View File

@@ -0,0 +1,913 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
# X-Gene 2.13/+68% 2.27
# Mongoose 1.77/+75% 1.12
# Kryo 2.70/+55% 1.13
# ThunderX2 1.17/+95% 1.36
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
# translator is not almighty;
$flavour=shift;
$output=shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
// forward "declarations" are required for Apple
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp $inp,xzr
stp xzr,xzr,[$ctx] // zero hash value
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifndef __KERNEL__
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
#ifdef __AARCH64EB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
and $r0,$r0,$s1 // &=0ffffffc0fffffff
and $s1,$s1,#-4
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
mov w#$s1,#-1
stp $r0,$r1,[$ctx,#32] // save key value
str w#$s1,[$ctx,#48] // impossible key power value
#ifndef __KERNEL__
tst w17,#ARMV7_NEON
adr $d0,.Lpoly1305_blocks
adr $r0,.Lpoly1305_blocks_neon
adr $d1,.Lpoly1305_emit
csel $d0,$d0,$r0,eq
# ifdef __ILP32__
stp w#$d0,w#$d1,[$len]
# else
stp $d0,$d1,[$len]
# endif
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
ldp $h0,$h1,[$ctx] // load hash value
ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
ldp $r0,$r1,[$ctx,#32] // load key value
#ifdef __AARCH64EB__
lsr $d0,$h0,#32
mov w#$d1,w#$h0
lsr $d2,$h1,#32
mov w15,w#$h1
lsr x16,$h2,#32
#else
mov w#$d0,w#$h0
lsr $d1,$h0,#32
mov w#$d2,w#$h1
lsr x15,$h1,#32
mov w16,w#$h2
#endif
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
lsr $d1,$d2,#12
adds $d0,$d0,$d2,lsl#52
add $d1,$d1,x15,lsl#14
adc $d1,$d1,xzr
lsr $d2,x16,#24
adds $d1,$d1,x16,lsl#40
adc $d2,$d2,xzr
cmp x17,#0 // is_base2_26?
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
csel $h0,$h0,$d0,eq // choose between radixes
csel $h1,$h1,$d1,eq
csel $h2,$h2,$d2,eq
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
sub $len,$len,#16
#ifdef __AARCH64EB__
rev $t0,$t0
rev $t1,$t1
#endif
adds $h0,$h0,$t0 // accumulate input
adcs $h1,$h1,$t1
mul $d0,$h0,$r0 // h0*r0
adc $h2,$h2,$padbit
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
cbnz $len,.Loop
stp $h0,$h1,[$ctx] // store hash value
stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
ldp $t0,$t1,[$nonce] // load nonce
#ifdef __AARCH64EB__
lsr $d0,$h0,#32
mov w#$d1,w#$h0
lsr $d2,$h1,#32
mov w15,w#$h1
lsr x16,$h2,#32
#else
mov w#$d0,w#$h0
lsr $d1,$h0,#32
mov w#$d2,w#$h1
lsr x15,$h1,#32
mov w16,w#$h2
#endif
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
lsr $d1,$d2,#12
adds $d0,$d0,$d2,lsl#52
add $d1,$d1,x15,lsl#14
adc $d1,$d1,xzr
lsr $d2,x16,#24
adds $d1,$d1,x16,lsl#40
adc $d2,$d2,xzr
cmp $r0,#0 // is_base2_26?
csel $h0,$h0,$d0,eq // choose between radixes
csel $h1,$h1,$d1,eq
csel $h2,$h2,$d2,eq
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __AARCH64EB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros; # borrow
$code.=<<___;
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul $d0,$h0,$r0 // h0*r0
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 4
poly1305_splat:
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,$h0,#26,#26
extr x14,$h1,$h0,#52
and x14,x14,#0x03ffffff
ubfx x15,$h1,#14,#26
extr x16,$h2,$h1,#40
str w12,[$ctx,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[$ctx,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[$ctx,#16*2] // s1
str w14,[$ctx,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[$ctx,#16*4] // s2
str w15,[$ctx,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[$ctx,#16*6] // s3
str w16,[$ctx,#16*7] // r4
str w15,[$ctx,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
#ifdef __KERNEL__
.globl poly1305_blocks_neon
#endif
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.lo .Lpoly1305_blocks
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
cbz $is_base2_26,.Lbase2_64_neon
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
tst $len,#31
b.eq .Leven_neon
ldp $r0,$r1,[$ctx,#32] // load key value
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $d2,$h2,xzr // can be partially reduced...
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
b .Leven_neon
.align 4
.Lbase2_64_neon:
ldp $r0,$r1,[$ctx,#32] // load key value
ldp $h0,$h1,[$ctx] // load hash value base 2^64
ldr $h2,[$ctx,#16]
tst $len,#31
b.eq .Linit_neon
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
.Linit_neon:
ldr w17,[$ctx,#48] // first table element
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
cmp w17,#-1 // is value impossible?
b.ne .Leven_neon
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
////////////////////////////////// initialize r^n table
mov $h0,$r0 // r^1
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
mov $h1,$r1
mov $h2,xzr
add $ctx,$ctx,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub $ctx,$ctx,#4
bl poly1305_splat
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
.align 4
.Leven_neon:
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
.Ldo_neon:
ldp x8,x12,[$inp,#32] // inp[2:3]
subs $len,$len,#64
ldp x9,x13,[$inp,#48]
add $in2,$inp,#96
adr $zeros,.Lzeros
lsl $padbit,$padbit,#24
add x15,$ctx,#48
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN23_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN23_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN23_2,x8
fmov $IN23_3,x10
fmov $IN23_4,x12
ldp x8,x12,[$inp],#16 // inp[0:1]
ldp x9,x13,[$inp],#48
ld1 {$R0,$R1,$S1,$R2},[x15],#64
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN01_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi $MASK.2d,#-1
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
ushr $MASK.2d,$MASK.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// \___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// \___________________/ \____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs $len,$len,#64
umull $ACC4,$IN23_0,${R4}[2]
csel $in2,$zeros,$in2,lo
umull $ACC3,$IN23_0,${R3}[2]
umull $ACC2,$IN23_0,${R2}[2]
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
umull $ACC1,$IN23_0,${R1}[2]
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal $ACC4,$IN23_1,${R3}[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC3,$IN23_1,${R2}[2]
and x5,x9,#0x03ffffff
umlal $ACC2,$IN23_1,${R1}[2]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN23_1,${R0}[2]
ubfx x7,x9,#26,#26
umlal $ACC0,$IN23_1,${S4}[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC4,$IN23_2,${R2}[2]
extr x8,x12,x8,#52
umlal $ACC3,$IN23_2,${R1}[2]
extr x9,x13,x9,#52
umlal $ACC2,$IN23_2,${R0}[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC1,$IN23_2,${S4}[2]
fmov $IN23_0,x4
umlal $ACC0,$IN23_2,${S3}[2]
and x8,x8,#0x03ffffff
umlal $ACC4,$IN23_3,${R1}[2]
and x9,x9,#0x03ffffff
umlal $ACC3,$IN23_3,${R0}[2]
ubfx x10,x12,#14,#26
umlal $ACC2,$IN23_3,${S4}[2]
ubfx x11,x13,#14,#26
umlal $ACC1,$IN23_3,${S3}[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC0,$IN23_3,${S2}[2]
fmov $IN23_1,x6
add $IN01_2,$IN01_2,$H2
add x12,$padbit,x12,lsr#40
umlal $ACC4,$IN23_4,${R0}[2]
add x13,$padbit,x13,lsr#40
umlal $ACC3,$IN23_4,${S4}[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC2,$IN23_4,${S3}[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN23_4,${S2}[2]
fmov $IN23_2,x8
umlal $ACC0,$IN23_4,${S1}[2]
fmov $IN23_3,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add $IN01_0,$IN01_0,$H0
fmov $IN23_4,x12
umlal $ACC3,$IN01_2,${R1}[0]
ldp x8,x12,[$inp],#16 // inp[0:1]
umlal $ACC0,$IN01_2,${S3}[0]
ldp x9,x13,[$inp],#48
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}[0]
umlal $ACC4,$IN01_0,${R4}[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC2,$IN01_0,${R2}[0]
and x5,x9,#0x03ffffff
umlal $ACC0,$IN01_0,${R0}[0]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN01_0,${R1}[0]
ubfx x7,x9,#26,#26
add $IN01_3,$IN01_3,$H3
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC3,$IN01_1,${R2}[0]
extr x8,x12,x8,#52
umlal $ACC4,$IN01_1,${R3}[0]
extr x9,x13,x9,#52
umlal $ACC0,$IN01_1,${S4}[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC2,$IN01_1,${R1}[0]
fmov $IN01_0,x4
umlal $ACC1,$IN01_1,${R0}[0]
and x8,x8,#0x03ffffff
add $IN01_4,$IN01_4,$H4
and x9,x9,#0x03ffffff
umlal $ACC3,$IN01_3,${R0}[0]
ubfx x10,x12,#14,#26
umlal $ACC0,$IN01_3,${S2}[0]
ubfx x11,x13,#14,#26
umlal $ACC4,$IN01_3,${R1}[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC1,$IN01_3,${S3}[0]
fmov $IN01_1,x6
umlal $ACC2,$IN01_3,${S4}[0]
add x12,$padbit,x12,lsr#40
umlal $ACC3,$IN01_4,${S4}[0]
add x13,$padbit,x13,lsr#40
umlal $ACC0,$IN01_4,${S1}[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC4,$IN01_4,${R0}[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN01_4,${S2}[0]
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
fmov $IN01_4,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr $T0.2d,$ACC3,#26
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
bic $H4,#0xfc,lsl#24
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
add $ACC0,$ACC0,$T0.2d // h4 -> h0
bic $H1,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
shrn $T0.2s,$ACC0,#26
xtn $H0,$ACC0
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
bic $H0,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
add $IN01_2,$IN01_2,$H2
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds $len,$len,#32
b.ne .Long_tail
dup $IN23_2,${IN01_2}[0]
add $IN23_0,$IN01_0,$H0
add $IN23_3,$IN01_3,$H3
add $IN23_1,$IN01_1,$H1
add $IN23_4,$IN01_4,$H4
.Long_tail:
dup $IN23_0,${IN23_0}[0]
umull2 $ACC0,$IN23_2,${S3}
umull2 $ACC3,$IN23_2,${R1}
umull2 $ACC4,$IN23_2,${R2}
umull2 $ACC2,$IN23_2,${R0}
umull2 $ACC1,$IN23_2,${S4}
dup $IN23_1,${IN23_1}[0]
umlal2 $ACC0,$IN23_0,${R0}
umlal2 $ACC2,$IN23_0,${R2}
umlal2 $ACC3,$IN23_0,${R3}
umlal2 $ACC4,$IN23_0,${R4}
umlal2 $ACC1,$IN23_0,${R1}
dup $IN23_3,${IN23_3}[0]
umlal2 $ACC0,$IN23_1,${S4}
umlal2 $ACC3,$IN23_1,${R2}
umlal2 $ACC2,$IN23_1,${R1}
umlal2 $ACC4,$IN23_1,${R3}
umlal2 $ACC1,$IN23_1,${R0}
dup $IN23_4,${IN23_4}[0]
umlal2 $ACC3,$IN23_3,${R0}
umlal2 $ACC4,$IN23_3,${R1}
umlal2 $ACC0,$IN23_3,${S2}
umlal2 $ACC1,$IN23_3,${S3}
umlal2 $ACC2,$IN23_3,${S4}
umlal2 $ACC3,$IN23_4,${S4}
umlal2 $ACC0,$IN23_4,${S1}
umlal2 $ACC4,$IN23_4,${R0}
umlal2 $ACC1,$IN23_4,${S2}
umlal2 $ACC2,$IN23_4,${S3}
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add $IN01_0,$IN01_0,$H0
umlal $ACC3,$IN01_2,${R1}
umlal $ACC0,$IN01_2,${S3}
umlal $ACC4,$IN01_2,${R2}
umlal $ACC1,$IN01_2,${S4}
umlal $ACC2,$IN01_2,${R0}
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}
umlal $ACC0,$IN01_0,${R0}
umlal $ACC4,$IN01_0,${R4}
umlal $ACC1,$IN01_0,${R1}
umlal $ACC2,$IN01_0,${R2}
add $IN01_3,$IN01_3,$H3
umlal $ACC3,$IN01_1,${R2}
umlal $ACC0,$IN01_1,${S4}
umlal $ACC4,$IN01_1,${R3}
umlal $ACC1,$IN01_1,${R0}
umlal $ACC2,$IN01_1,${R1}
add $IN01_4,$IN01_4,$H4
umlal $ACC3,$IN01_3,${R0}
umlal $ACC0,$IN01_3,${S2}
umlal $ACC4,$IN01_3,${R1}
umlal $ACC1,$IN01_3,${S3}
umlal $ACC2,$IN01_3,${S4}
umlal $ACC3,$IN01_4,${S4}
umlal $ACC0,$IN01_4,${S1}
umlal $ACC4,$IN01_4,${R0}
umlal $ACC1,$IN01_4,${S2}
umlal $ACC2,$IN01_4,${S3}
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC3,$ACC3,$ACC3
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC4,$ACC4,$ACC4
ldp d12,d13,[sp,#48]
addp $ACC1,$ACC1,$ACC1
ldp d14,d15,[sp,#64]
addp $ACC2,$ACC2,$ACC2
ldr x30,[sp,#8]
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr $T0.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
and $ACC4,$ACC4,$MASK.2d
ushr $T1.2d,$ACC1,#26
and $ACC1,$ACC1,$MASK.2d
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
ushr $T1.2d,$ACC2,#26
and $ACC2,$ACC2,$MASK.2d
add $ACC0,$ACC0,$T0.2d // h4 -> h0
add $ACC3,$ACC3,$T1.2d // h2 -> h3
ushr $T0.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
ushr $T1.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
add $ACC1,$ACC1,$T0.2d // h0 -> h1
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
mov x4,#1
st1 {$ACC4}[0],[$ctx]
str x4,[$ctx,#8] // set is_base2_26
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
.align 2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
foreach (split("\n",$code)) {
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
s/\.[124]([sd])\[/.$1\[/;
s/w#x([0-9]+)/w$1/g;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,835 @@
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
// forward "declarations" are required for Apple
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifndef __KERNEL__
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
#ifdef __AARCH64EB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
mov w9,#-1
stp x7,x8,[x0,#32] // save key value
str w9,[x0,#48] // impossible key power value
#ifndef __KERNEL__
tst w17,#ARMV7_NEON
adr x12,.Lpoly1305_blocks
adr x7,.Lpoly1305_blocks_neon
adr x13,.Lpoly1305_emit
csel x12,x12,x7,eq
# ifdef __ILP32__
stp w12,w13,[x2]
# else
stp x12,x13,[x2]
# endif
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
ands x2,x2,#-16
b.eq .Lno_data
ldp x4,x5,[x0] // load hash value
ldp x6,x17,[x0,#16] // [along with is_base2_26]
ldp x7,x8,[x0,#32] // load key value
#ifdef __AARCH64EB__
lsr x12,x4,#32
mov w13,w4
lsr x14,x5,#32
mov w15,w5
lsr x16,x6,#32
#else
mov w12,w4
lsr x13,x4,#32
mov w14,w5
lsr x15,x5,#32
mov w16,w6
#endif
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
lsr x13,x14,#12
adds x12,x12,x14,lsl#52
add x13,x13,x15,lsl#14
adc x13,x13,xzr
lsr x14,x16,#24
adds x13,x13,x16,lsl#40
adc x14,x14,xzr
cmp x17,#0 // is_base2_26?
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
csel x4,x4,x12,eq // choose between radixes
csel x5,x5,x13,eq
csel x6,x6,x14,eq
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __AARCH64EB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11
mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
cbnz x2,.Loop
stp x4,x5,[x0] // store hash value
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
ldp x4,x5,[x0] // load hash base 2^64
ldp x6,x7,[x0,#16] // [along with is_base2_26]
ldp x10,x11,[x2] // load nonce
#ifdef __AARCH64EB__
lsr x12,x4,#32
mov w13,w4
lsr x14,x5,#32
mov w15,w5
lsr x16,x6,#32
#else
mov w12,w4
lsr x13,x4,#32
mov w14,w5
lsr x15,x5,#32
mov w16,w6
#endif
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
lsr x13,x14,#12
adds x12,x12,x14,lsl#52
add x13,x13,x15,lsl#14
adc x13,x13,xzr
lsr x14,x16,#24
adds x13,x13,x16,lsl#40
adc x14,x14,xzr
cmp x7,#0 // is_base2_26?
csel x4,x4,x12,eq // choose between radixes
csel x5,x5,x13,eq
csel x6,x6,x14,eq
adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr
tst x14,#-4 // see if it's carried/borrowed
csel x4,x4,x12,eq
csel x5,x5,x13,eq
#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result
ret
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 4
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40
str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
#ifdef __KERNEL__
.globl poly1305_blocks_neon
#endif
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.lo .Lpoly1305_blocks
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
cbz x17,.Lbase2_64_neon
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]
tst x2,#31
b.eq .Leven_neon
ldp x7,x8,[x0,#32] // load key value
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
b .Leven_neon
.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value
ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]
tst x2,#31
b.eq .Linit_neon
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
.Linit_neon:
ldr w17,[x0,#48] // first table element
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
cmp w17,#-1 // is value impossible?
b.ne .Leven_neon
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
sub x0,x0,#48 // restore original x0
b .Ldo_neon
.align 4
.Leven_neon:
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
.Ldo_neon:
ldp x8,x12,[x1,#32] // inp[2:3]
subs x2,x2,#64
ldp x9,x13,[x1,#48]
add x16,x1,#96
adr x17,.Lzeros
lsl x3,x3,#24
add x15,x0,#48
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12
ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff
umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6
add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26
add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff
add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40
umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr v29.2d,v22.2d,#26
xtn v27.2s,v22.2d
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
xtn v28.2s,v23.2d
ushr v30.2d,v20.2d,#26
xtn v25.2s,v20.2d
bic v28.2s,#0xfc,lsl#24
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
shrn v30.2s,v21.2d,#26
xtn v26.2s,v21.2d
add v19.2d,v19.2d,v29.2d // h4 -> h0
bic v25.2s,#0xfc,lsl#24
add v27.2s,v27.2s,v30.2s // h2 -> h3
bic v26.2s,#0xfc,lsl#24
shrn v29.2s,v19.2d,#26
xtn v24.2s,v19.2d
ushr v30.2s,v27.2s,#26
bic v27.2s,#0xfc,lsl#24
bic v24.2s,#0xfc,lsl#24
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds x2,x2,#32
b.ne .Long_tail
dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
add v17.2s,v12.2s,v27.2s
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s
.Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
umull2 v23.2d,v16.4s,v3.4s
umull2 v21.2d,v16.4s,v0.4s
umull2 v20.2d,v16.4s,v8.4s
dup v15.2d,v15.d[0]
umlal2 v19.2d,v14.4s,v0.4s
umlal2 v21.2d,v14.4s,v3.4s
umlal2 v22.2d,v14.4s,v5.4s
umlal2 v23.2d,v14.4s,v7.4s
umlal2 v20.2d,v14.4s,v1.4s
dup v17.2d,v17.d[0]
umlal2 v19.2d,v15.4s,v8.4s
umlal2 v22.2d,v15.4s,v3.4s
umlal2 v21.2d,v15.4s,v1.4s
umlal2 v23.2d,v15.4s,v5.4s
umlal2 v20.2d,v15.4s,v0.4s
dup v18.2d,v18.d[0]
umlal2 v22.2d,v17.4s,v0.4s
umlal2 v23.2d,v17.4s,v1.4s
umlal2 v19.2d,v17.4s,v4.4s
umlal2 v20.2d,v17.4s,v6.4s
umlal2 v21.2d,v17.4s,v8.4s
umlal2 v22.2d,v18.4s,v8.4s
umlal2 v19.2d,v18.4s,v2.4s
umlal2 v23.2d,v18.4s,v0.4s
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add v9.2s,v9.2s,v24.2s
umlal v22.2d,v11.2s,v1.2s
umlal v19.2d,v11.2s,v6.2s
umlal v23.2d,v11.2s,v3.2s
umlal v20.2d,v11.2s,v8.2s
umlal v21.2d,v11.2s,v0.2s
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.2s
umlal v19.2d,v9.2s,v0.2s
umlal v23.2d,v9.2s,v7.2s
umlal v20.2d,v9.2s,v1.2s
umlal v21.2d,v9.2s,v3.2s
add v12.2s,v12.2s,v27.2s
umlal v22.2d,v10.2s,v3.2s
umlal v19.2d,v10.2s,v8.2s
umlal v23.2d,v10.2s,v5.2s
umlal v20.2d,v10.2s,v0.2s
umlal v21.2d,v10.2s,v1.2s
add v13.2s,v13.2s,v28.2s
umlal v22.2d,v12.2s,v0.2s
umlal v19.2d,v12.2s,v4.2s
umlal v23.2d,v12.2s,v1.2s
umlal v20.2d,v12.2s,v6.2s
umlal v21.2d,v12.2s,v8.2s
umlal v22.2d,v13.2s,v8.2s
umlal v19.2d,v13.2s,v2.2s
umlal v23.2d,v13.2s,v0.2s
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp v22.2d,v22.2d,v22.2d
ldp d8,d9,[sp,#16] // meet ABI requirements
addp v19.2d,v19.2d,v19.2d
ldp d10,d11,[sp,#32]
addp v23.2d,v23.2d,v23.2d
ldp d12,d13,[sp,#48]
addp v20.2d,v20.2d,v20.2d
ldp d14,d15,[sp,#64]
addp v21.2d,v21.2d,v21.2d
ldr x30,[sp,#8]
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr v29.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
and v23.16b,v23.16b,v31.16b
ushr v30.2d,v20.2d,#26
and v20.16b,v20.16b,v31.16b
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
ushr v30.2d,v21.2d,#26
and v21.16b,v21.16b,v31.16b
add v19.2d,v19.2d,v29.2d // h4 -> h0
add v22.2d,v22.2d,v30.2d // h2 -> h3
ushr v29.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
ushr v30.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
add v20.2d,v20.2d,v29.2d // h0 -> h1
add v23.2d,v23.2d,v30.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
mov x4,#1
st1 {v23.s}[0],[x0]
str x4,[x0,#8] // set is_base2_26
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align 2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif

View File

@@ -0,0 +1,230 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_arm64(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int neon_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit, bool do_neon)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_arch(dctx, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
if (static_branch_likely(&have_neon) && likely(do_neon))
poly1305_blocks_neon(&dctx->h, src, len, hibit);
else
poly1305_blocks(&dctx->h, src, len, hibit);
}
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
const u8 *src, u32 len, bool do_neon)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
neon_poly1305_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE, 1, false);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
}
static int neon_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
bool do_neon = may_use_simd() && srclen > 128;
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_begin();
neon_poly1305_do_update(dctx, src, srclen, do_neon);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_end();
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && may_use_simd()) {
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
kernel_neon_begin();
poly1305_blocks_neon(&dctx->h, src, todo, 1);
kernel_neon_end();
len -= todo;
src += todo;
} while (len);
} else {
poly1305_blocks(&dctx->h, src, len, 1);
src += len;
}
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg neon_poly1305_alg = {
.init = neon_poly1305_init,
.update = neon_poly1305_update,
.final = neon_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-neon",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
};
static int __init neon_poly1305_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return 0;
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shash(&neon_poly1305_alg) : 0;
}
static void __exit neon_poly1305_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && (elf_hwcap & HWCAP_ASIMD))
crypto_unregister_shash(&neon_poly1305_alg);
}
module_init(neon_poly1305_mod_init);
module_exit(neon_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-neon");

View File

@@ -22,16 +22,21 @@ endif
CC_COMPAT ?= $(CC)
CC_COMPAT += $(CC_COMPAT_CLANG_FLAGS)
ifeq ($(LLVM),1)
LD_COMPAT ?= $(LD)
else
LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld
endif
else
CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc
LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld
endif
cc32-option = $(call try-run,\
$(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
cc32-disable-warning = $(call try-run,\
$(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
cc32-ldoption = $(call try-run,\
$(CC_COMPAT) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
cc32-as-instr = $(call try-run,\
printf "%b\n" "$(1)" | $(CC_COMPAT) $(VDSO_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3))
@@ -114,14 +119,10 @@ dmbinstr := $(call cc32-as-instr,dmb ishld,-DCONFIG_AS_DMB_ISHLD=1)
VDSO_CFLAGS += $(dmbinstr)
VDSO_AFLAGS += $(dmbinstr)
VDSO_LDFLAGS := $(VDSO_CPPFLAGS)
# From arm vDSO Makefile
VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft
VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--hash-style=sysv)
VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--build-id)
VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd)
VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1
VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096
VDSO_LDFLAGS += -nostdlib -shared --hash-style=sysv --build-id
# Borrow vdsomunge.c from the arm vDSO
@@ -182,8 +183,8 @@ quiet_cmd_vdsold_and_vdso_check = LD32 $@
cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check)
quiet_cmd_vdsold = LD32 $@
cmd_vdsold = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \
-Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
cmd_vdsold = $(LD_COMPAT) $(VDSO_LDFLAGS) \
-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
quiet_cmd_vdsocc = CC32 $@
cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $<
quiet_cmd_vdsocc_gettimeofday = CC32 $@

View File

@@ -499,7 +499,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
if (map_start < map_end)
memmap_init_zone((unsigned long)(map_end - map_start),
args->nid, args->zone, page_to_pfn(map_start),
MEMMAP_EARLY, NULL);
MEMINIT_EARLY, NULL);
return 0;
}
@@ -508,8 +508,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
if (!vmem_map) {
memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY,
NULL);
memmap_init_zone(size, nid, zone, start_pfn,
MEMINIT_EARLY, NULL);
} else {
struct page *start;
struct memmap_init_callback_data args;

View File

@@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
# See arch/mips/Kbuild for content of core part of the kernel
core-y += arch/mips/
drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
drivers-y += arch/mips/crypto/
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
# suspend and hibernation support

View File

@@ -4,3 +4,21 @@
#
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
chacha-mips-y := chacha-core.o chacha-glue.o
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
poly1305-mips-y := poly1305-core.o poly1305-glue.o
perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
$(call if_changed,perlasm)
targets += poly1305-core.S

View File

@@ -0,0 +1,497 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32
#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n
/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3
/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
* We don't want to touch original value in memory.
* Must be incremented every loop iteration.
*/
#define NONCE_0 $v0
/* SAVED_X and SAVED_CA are set in the jump table.
* Use regs which are overwritten on exit else we don't leak clear data.
* They are used to handling the last bytes which are not multiple of 4.
*/
#define SAVED_X X15
#define SAVED_CA $s7
#define IS_UNALIGNED $s7
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
wsbh n; \
rotr n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
#define FOR_EACH_WORD(x) \
x( 0); \
x( 1); \
x( 2); \
x( 3); \
x( 4); \
x( 5); \
x( 6); \
x( 7); \
x( 8); \
x( 9); \
x(10); \
x(11); \
x(12); \
x(13); \
x(14); \
x(15);
#define FOR_EACH_WORD_REV(x) \
x(15); \
x(14); \
x(13); \
x(12); \
x(11); \
x(10); \
x( 9); \
x( 8); \
x( 7); \
x( 6); \
x( 5); \
x( 4); \
x( 3); \
x( 2); \
x( 1); \
x( 0);
#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
lwl T1, (x*4)+MSB ## (IN); \
lwr T1, (x*4)+LSB ## (IN); \
.if (x == 12); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
swl X ## x, (x*4)+MSB ## (OUT); \
swr X ## x, (x*4)+LSB ## (OUT);
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
lw T1, (x*4) ## (IN); \
.if (x == 12); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
sw X ## x, (x*4) ## (OUT);
/* Jump table macro.
* Used for setup and handling the last bytes, which are not multiple of 4.
* X15 is free to store Xn
* Every jumptable entry must be equal in size.
*/
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
addu X(A), X(K); \
addu X(B), X(L); \
addu X(C), X(M); \
addu X(D), X(N); \
xor X(V), X(A); \
xor X(W), X(B); \
xor X(Y), X(C); \
xor X(Z), X(D); \
rotl X(V), S; \
rotl X(W), S; \
rotl X(Y), S; \
rotl X(Z), S;
.text
.set reorder
.set noat
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
.frame $sp, STACK_SIZE, $ra
/* Load number of rounds */
lw $at, 16($sp)
addiu $sp, -STACK_SIZE
/* Return bytes = 0. */
beqz BYTES, .Lchacha_mips_end
lw NONCE_0, 48(STATE)
/* Save s0-s7 */
sw $s0, 0($sp)
sw $s1, 4($sp)
sw $s2, 8($sp)
sw $s3, 12($sp)
sw $s4, 16($sp)
sw $s5, 20($sp)
sw $s6, 24($sp)
sw $s7, 28($sp)
/* Test IN or OUT is unaligned.
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
*/
or IS_UNALIGNED, IN, OUT
andi IS_UNALIGNED, 0x3
b .Lchacha_rounds_start
.align 4
.Loop_chacha_rounds:
addiu IN, CHACHA20_BLOCK_SIZE
addiu OUT, CHACHA20_BLOCK_SIZE
addiu NONCE_0, 1
.Lchacha_rounds_start:
lw X0, 0(STATE)
lw X1, 4(STATE)
lw X2, 8(STATE)
lw X3, 12(STATE)
lw X4, 16(STATE)
lw X5, 20(STATE)
lw X6, 24(STATE)
lw X7, 28(STATE)
lw X8, 32(STATE)
lw X9, 36(STATE)
lw X10, 40(STATE)
lw X11, 44(STATE)
move X12, NONCE_0
lw X13, 52(STATE)
lw X14, 56(STATE)
lw X15, 60(STATE)
.Loop_chacha_xor_rounds:
addiu $at, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
bnez $at, .Loop_chacha_xor_rounds
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
/* Is data src/dst unaligned? Jump */
bnez IS_UNALIGNED, .Loop_chacha_unaligned
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16)($sp)
/* BYTES < 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_aligned
FOR_EACH_WORD_REV(STORE_ALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Place this here to fill delay slot */
addiu NONCE_0, 1
/* BYTES < 0? Handle last bytes */
bltz BYTES, .Lchacha_mips_xor_bytes
.Lchacha_mips_xor_done:
/* Restore used registers */
lw $s0, 0($sp)
lw $s1, 4($sp)
lw $s2, 8($sp)
lw $s3, 12($sp)
lw $s4, 16($sp)
lw $s5, 20($sp)
lw $s6, 24($sp)
lw $s7, 28($sp)
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
.Lchacha_mips_end:
addiu $sp, STACK_SIZE
jr $ra
.Lchacha_mips_no_full_block_aligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_ALIGNED)
.Loop_chacha_unaligned:
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16)($sp)
/* BYTES > 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
FOR_EACH_WORD_REV(STORE_UNALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
.set noreorder
/* Fall through to byte handling */
bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
/* Place this here to fill delay slot */
addiu NONCE_0, 1
.set reorder
.Lchacha_mips_xor_bytes:
addu IN, $at
addu OUT, $at
/* First byte */
lbu T1, 0(IN)
addiu $at, BYTES, 1
CPU_TO_LE32(SAVED_X)
ROTR(SAVED_X)
xor T1, SAVED_X
sb T1, 0(OUT)
beqz $at, .Lchacha_mips_xor_done
/* Second byte */
lbu T1, 1(IN)
addiu $at, BYTES, 2
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 1(OUT)
beqz $at, .Lchacha_mips_xor_done
/* Third byte */
lbu T1, 2(IN)
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 2(OUT)
b .Lchacha_mips_xor_done
.Lchacha_mips_no_full_block_unaligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
/* Input arguments
* STATE $a0
* OUT $a1
* NROUND $a2
*/
#undef X12
#undef X13
#undef X14
#undef X15
#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
.set noat
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
.frame $sp, STACK_SIZE, $ra
addiu $sp, -STACK_SIZE
/* Save X11(s6) */
sw X11, 0($sp)
lw X0, 0(STATE)
lw X1, 4(STATE)
lw X2, 8(STATE)
lw X3, 12(STATE)
lw X4, 16(STATE)
lw X5, 20(STATE)
lw X6, 24(STATE)
lw X7, 28(STATE)
lw X8, 32(STATE)
lw X9, 36(STATE)
lw X10, 40(STATE)
lw X11, 44(STATE)
lw X12, 48(STATE)
lw X13, 52(STATE)
lw X14, 56(STATE)
lw X15, 60(STATE)
.Loop_hchacha_xor_rounds:
addiu $a2, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
bnez $a2, .Loop_hchacha_xor_rounds
/* Restore used register */
lw X11, 0($sp)
sw X0, 0(OUT)
sw X1, 4(OUT)
sw X2, 8(OUT)
sw X3, 12(OUT)
sw X12, 16(OUT)
sw X13, 20(OUT)
sw X14, 24(OUT)
sw X15, 28(OUT)
addiu $sp, STACK_SIZE
jr $ra
.end hchacha_block_arch
.set at

View File

@@ -0,0 +1,152 @@
// SPDX-License-Identifier: GPL-2.0
/*
* MIPS accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/byteorder.h>
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds);
EXPORT_SYMBOL(chacha_crypt_arch);
asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
static int chacha_mips_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_mips(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_mips_stream_xor(req, ctx, req->iv);
}
static int xchacha_mips(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
hchacha_block(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_mips_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_mips,
.decrypt = chacha_mips,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_mips,
.decrypt = xchacha_mips,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_mips,
.decrypt = xchacha_mips,
}
};
static int __init chacha_simd_mod_init(void)
{
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-mips");

View File

@@ -0,0 +1,191 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_mips(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int mips_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_mips(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
poly1305_blocks_mips(&dctx->h, src, len, hibit);
}
static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
mips_poly1305_blocks(dctx, src, len, 1);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks_mips(&dctx->h, dctx->buf,
POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
poly1305_blocks_mips(&dctx->h, src, len, 1);
src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit_mips(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg mips_poly1305_alg = {
.init = mips_poly1305_init,
.update = mips_poly1305_update,
.final = mips_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-mips",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
};
static int __init mips_poly1305_mod_init(void)
{
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shash(&mips_poly1305_alg) : 0;
}
static void __exit mips_poly1305_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
crypto_unregister_shash(&mips_poly1305_alg);
}
module_init(mips_poly1305_mod_init);
module_exit(mips_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-mips");

File diff suppressed because it is too large Load Diff

View File

@@ -12,6 +12,8 @@
#ifndef _ASM_POWERPC_LMB_H
#define _ASM_POWERPC_LMB_H
#include <linux/sched.h>
struct drmem_lmb {
u64 base_addr;
u32 drc_index;
@@ -27,8 +29,22 @@ struct drmem_lmb_info {
extern struct drmem_lmb_info *drmem_info;
static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
const struct drmem_lmb *start)
{
/*
* DLPAR code paths can take several milliseconds per element
* when interacting with firmware. Ensure that we don't
* unfairly monopolize the CPU.
*/
if (((++lmb - start) % 16) == 0)
cond_resched();
return lmb;
}
#define for_each_drmem_lmb_in_range(lmb, start, end) \
for ((lmb) = (start); (lmb) < (end); (lmb)++)
for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))
#define for_each_drmem_lmb(lmb) \
for_each_drmem_lmb_in_range((lmb), \

View File

@@ -788,7 +788,7 @@
#define THRM1_TIN (1 << 31)
#define THRM1_TIV (1 << 30)
#define THRM1_THRES(x) ((x&0x7f)<<23)
#define THRM3_SITV(x) ((x&0x3fff)<<1)
#define THRM3_SITV(x) ((x & 0x1fff) << 1)
#define THRM1_TID (1<<2)
#define THRM1_TIE (1<<1)
#define THRM1_V (1<<0)

View File

@@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
return false;
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
}
static inline void mm_reset_thread_local(struct mm_struct *mm)
{
WARN_ON(atomic_read(&mm->context.copros) > 0);
/*
* It's possible for mm_access to take a reference on mm_users to
* access the remote mm from another thread, but it's not allowed
* to set mm_cpumask, so mm_users may be > 1 here.
*/
WARN_ON(current->mm != mm);
atomic_set(&mm->context.active_cpus, 1);
cpumask_clear(mm_cpumask(mm));
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
}
#else /* CONFIG_PPC_BOOK3S_64 */
static inline int mm_is_thread_local(struct mm_struct *mm)
{

View File

@@ -13,13 +13,14 @@
*/
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <asm/io.h>
#include <asm/reg.h>
@@ -39,9 +40,7 @@ static struct tau_temp
unsigned char grew;
} tau[NR_CPUS];
struct timer_list tau_timer;
#undef DEBUG
static bool tau_int_enable;
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
* dynamic adjustment to minimize # of interrupts */
@@ -50,72 +49,49 @@ struct timer_list tau_timer;
#define step_size 2 /* step size when temp goes out of range */
#define window_expand 1 /* expand the window by this much */
/* configurable values for shrinking the window */
#define shrink_timer 2*HZ /* period between shrinking the window */
#define shrink_timer 2000 /* period between shrinking the window */
#define min_window 2 /* minimum window size, degrees C */
static void set_thresholds(unsigned long cpu)
{
#ifdef CONFIG_TAU_INT
/*
* setup THRM1,
* threshold, valid bit, enable interrupts, interrupt when below threshold
*/
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
/* setup THRM2,
* threshold, valid bit, enable interrupts, interrupt when above threshold
*/
mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
#else
/* same thing but don't enable interrupts */
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
#endif
/* setup THRM1, threshold, valid bit, interrupt when below threshold */
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
/* setup THRM2, threshold, valid bit, interrupt when above threshold */
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
}
static void TAUupdate(int cpu)
{
unsigned thrm;
#ifdef DEBUG
printk("TAUupdate ");
#endif
u32 thrm;
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
/* if both thresholds are crossed, the step_sizes cancel out
* and the window winds up getting expanded twice. */
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
if(thrm & THRM1_TIN){ /* crossed low threshold */
if (tau[cpu].low >= step_size){
tau[cpu].low -= step_size;
tau[cpu].high -= (step_size - window_expand);
}
tau[cpu].grew = 1;
#ifdef DEBUG
printk("low threshold crossed ");
#endif
thrm = mfspr(SPRN_THRM1);
if ((thrm & bits) == bits) {
mtspr(SPRN_THRM1, 0);
if (tau[cpu].low >= step_size) {
tau[cpu].low -= step_size;
tau[cpu].high -= (step_size - window_expand);
}
tau[cpu].grew = 1;
pr_debug("%s: low threshold crossed\n", __func__);
}
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
if(thrm & THRM1_TIN){ /* crossed high threshold */
if (tau[cpu].high <= 127-step_size){
tau[cpu].low += (step_size - window_expand);
tau[cpu].high += step_size;
}
tau[cpu].grew = 1;
#ifdef DEBUG
printk("high threshold crossed ");
#endif
thrm = mfspr(SPRN_THRM2);
if ((thrm & bits) == bits) {
mtspr(SPRN_THRM2, 0);
if (tau[cpu].high <= 127 - step_size) {
tau[cpu].low += (step_size - window_expand);
tau[cpu].high += step_size;
}
tau[cpu].grew = 1;
pr_debug("%s: high threshold crossed\n", __func__);
}
#ifdef DEBUG
printk("grew = %d\n", tau[cpu].grew);
#endif
#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
set_thresholds(cpu);
#endif
}
#ifdef CONFIG_TAU_INT
@@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
static void tau_timeout(void * info)
{
int cpu;
unsigned long flags;
int size;
int shrink;
/* disabling interrupts *should* be okay */
local_irq_save(flags);
cpu = smp_processor_id();
#ifndef CONFIG_TAU_INT
TAUupdate(cpu);
#endif
if (!tau_int_enable)
TAUupdate(cpu);
/* Stop thermal sensor comparisons and interrupts */
mtspr(SPRN_THRM3, 0);
size = tau[cpu].high - tau[cpu].low;
if (size > min_window && ! tau[cpu].grew) {
@@ -173,32 +148,26 @@ static void tau_timeout(void * info)
set_thresholds(cpu);
/*
* Do the enable every time, since otherwise a bunch of (relatively)
* complex sleep code needs to be added. One mtspr every time
* tau_timeout is called is probably not a big deal.
*
* Enable thermal sensor and set up sample interval timer
* need 20 us to do the compare.. until a nice 'cpu_speed' function
* call is implemented, just assume a 500 mhz clock. It doesn't really
* matter if we take too long for a compare since it's all interrupt
* driven anyway.
*
* use a extra long time.. (60 us @ 500 mhz)
/* Restart thermal sensor comparisons and interrupts.
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
* recommends that "the maximum value be set in THRM3 under all
* conditions."
*/
mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
local_irq_restore(flags);
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
}
static void tau_timeout_smp(struct timer_list *unused)
static struct workqueue_struct *tau_workq;
static void tau_work_func(struct work_struct *work)
{
/* schedule ourselves to be run again */
mod_timer(&tau_timer, jiffies + shrink_timer) ;
msleep(shrink_timer);
on_each_cpu(tau_timeout, NULL, 0);
/* schedule ourselves to be run again */
queue_work(tau_workq, work);
}
DECLARE_WORK(tau_work, tau_work_func);
/*
* setup the TAU
*
@@ -231,21 +200,19 @@ static int __init TAU_init(void)
return 1;
}
tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
!strcmp(cur_cpu_spec->platform, "ppc750");
/* first, set up the window shrinking timer */
timer_setup(&tau_timer, tau_timeout_smp, 0);
tau_timer.expires = jiffies + shrink_timer;
add_timer(&tau_timer);
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
if (!tau_workq)
return -ENOMEM;
on_each_cpu(TAU_init_smp, NULL, 0);
printk("Thermal assist unit ");
#ifdef CONFIG_TAU_INT
printk("using interrupts, ");
#else
printk("using timers, ");
#endif
printk("shrink_timer: %d jiffies\n", shrink_timer);
queue_work(tau_workq, &tau_work);
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
tau_initialized = 1;
return 0;

View File

@@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
struct mm_struct *mm = arg;
unsigned long pid = mm->context.id;
/*
* A kthread could have done a mmget_not_zero() after the flushing CPU
* checked mm_is_singlethreaded, and be in the process of
* kthread_use_mm when interrupted here. In that case, current->mm will
* be set to mm, because kthread_use_mm() setting ->mm and switching to
* the mm is done with interrupts off.
*/
if (current->mm == mm)
return; /* Local CPU */
goto out_flush;
if (current->active_mm == mm) {
/*
* Must be a kernel thread because sender is single-threaded.
*/
BUG_ON(current->mm);
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
mmgrab(&init_mm);
switch_mm(mm, &init_mm, current);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
mmdrop(mm);
}
atomic_dec(&mm->context.active_cpus);
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
out_flush:
_tlbiel_pid(pid, RIC_FLUSH_ALL);
}
@@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
*/
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
(void *)mm, 1);
mm_reset_thread_local(mm);
}
void radix__flush_tlb_mm(struct mm_struct *mm)

View File

@@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id)
#define REQUEST_NAME system_performance_capabilities
#define REQUEST_NUM 0x40
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__field(0, 1, perf_collect_privileged)
__field(0x1, 1, capability_mask)
@@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id)
#define REQUEST_NAME system_hypervisor_times
#define REQUEST_NUM 0xF0
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
@@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
#define REQUEST_NAME system_tlbie_count_and_time
#define REQUEST_NUM 0xF4
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__count(0, 8, tlbie_instructions_issued)
/*

View File

@@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
mask |= CNST_PMC_MASK(pmc);
value |= CNST_PMC_VAL(pmc);
/*
* PMC5 and PMC6 are used to count cycles and instructions and
* they do not support most of the constraint bits. Add a check
* to exclude PMC5/6 from most of the constraints except for
* EBB/BHRB.
*/
if (pmc >= 5)
goto ebb_bhrb;
}
if (pmc <= 4) {
@@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
}
}
ebb_bhrb:
if (!pmc && ebb)
/* EBB events must specify the PMC */
return -1;

View File

@@ -238,12 +238,11 @@ config TAU
temperature within 2-4 degrees Celsius. This option shows the current
on-die temperature in /proc/cpuinfo if the cpu supports it.
Unfortunately, on some chip revisions, this sensor is very inaccurate
and in many cases, does not work at all, so don't assume the cpu
temp is actually what /proc/cpuinfo says it is.
Unfortunately, this sensor is very inaccurate when uncalibrated, so
don't assume the cpu temp is actually what /proc/cpuinfo says it is.
config TAU_INT
bool "Interrupt driven TAU driver (DANGEROUS)"
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
depends on TAU
---help---
The TAU supports an interrupt driven mode which causes an interrupt
@@ -251,12 +250,7 @@ config TAU_INT
to get notified the temp has exceeded a range. With this option off,
a timer is used to re-check the temperature periodically.
However, on some cpus it appears that the TAU interrupt hardware
is buggy and can cause a situation which would lead unexplained hard
lockups.
Unless you are extending the TAU driver, or enjoy kernel/hardware
debugging, leave this option off.
If in doubt, say N here.
config TAU_AVERAGE
bool "Average high and low temp"

View File

@@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
uint32_t type)
static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
{
struct dump_obj *dump;
int rc;
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
if (!dump)
return NULL;
return;
dump->kobj.kset = dump_kset;
@@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
if (rc) {
kobject_put(&dump->kobj);
return NULL;
return;
}
/*
* As soon as the sysfs file for this dump is created/activated there is
* a chance the opal_errd daemon (or any userspace) might read and
* acknowledge the dump before kobject_uevent() is called. If that
* happens then there is a potential race between
* dump_ack_store->kobject_put() and kobject_uevent() which leads to a
* use-after-free of a kernfs object resulting in a kernel crash.
*
* To avoid that, we need to take a reference on behalf of the bin file,
* so that our reference remains valid while we call kobject_uevent().
* We then drop our reference before exiting the function, leaving the
* bin file to drop the last reference (if it hasn't already).
*/
/* Take a reference for the bin file */
kobject_get(&dump->kobj);
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
if (rc) {
if (rc == 0) {
kobject_uevent(&dump->kobj, KOBJ_ADD);
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
__func__, dump->id, dump->size);
} else {
/* Drop reference count taken for bin file */
kobject_put(&dump->kobj);
return NULL;
}
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
__func__, dump->id, dump->size);
kobject_uevent(&dump->kobj, KOBJ_ADD);
return dump;
/* Drop our reference */
kobject_put(&dump->kobj);
return;
}
static irqreturn_t process_dump(int irq, void *data)

View File

@@ -40,6 +40,7 @@ static __init int rng_init(void)
ppc_md.get_random_seed = pseries_get_random_long;
of_node_put(dn);
return 0;
}
machine_subsys_initcall(pseries, rng_init);

View File

@@ -179,6 +179,7 @@ int icp_hv_init(void)
icp_ops = &icp_hv_ops;
of_node_put(np);
return 0;
}

View File

@@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)

View File

@@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
# CONFIG_SLAB_MERGE_DEFAULT is not set
CONFIG_PROFILING=y
CONFIG_SMP=y
CONFIG_X86_X2APIC=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_NR_CPUS=32
@@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set
@@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y
CONFIG_USB_GADGET_VBUS_DRAW=500
@@ -435,6 +438,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y

1
arch/x86/crypto/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
poly1305-x86_64-cryptogams.S

View File

@@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
# These modules require the assembler to support ADX.
ifeq ($(adx_supported),yes)
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
endif
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif
# These modules require assembler to support AVX2.
@@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
targets += poly1305-x86_64-cryptogams.S
endif
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
@@ -100,20 +114,22 @@ endif
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o
chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif
ifeq ($(avx512_supported),yes)
chacha-x86_64-y += chacha-avx512vl-x86_64.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
ifeq ($(sha1_ni_supported),yes)
sha1-ssse3-y += sha1_ni_asm.o
@@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
$(call if_changed,perlasm)

View File

@@ -0,0 +1,258 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
#include <linux/linkage.h>
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
.text
#ifdef CONFIG_AS_SSSE3
ENTRY(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
movdqu (%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqa ROT16(%rip),%xmm12
movdqa ROR328(%rip),%xmm13
movdqu 0x20(%rdi),%xmm14
movq %rcx,%xmm15
leaq SIGMA+0xa0(%rip),%r8
jmp .Lbeginofloop
.align 32
.Lbeginofloop:
movdqa %xmm0,%xmm10
movdqa %xmm1,%xmm11
paddq %xmm15,%xmm14
movdqa IV(%rip),%xmm2
movdqa %xmm14,%xmm3
pxor IV+0x10(%rip),%xmm3
leaq SIGMA(%rip),%rcx
.Lroundloop:
movzbl (%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0x1(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x2(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x3(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0x4(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x5(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x6(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0x7(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
movzbl 0x8(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x9(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xa(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xb(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0xc(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xd(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xe(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0xf(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $0x10,%rcx
cmpq %r8,%rcx
jnz .Lroundloop
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
addq $0x40,%rsi
decq %rdx
jnz .Lbeginofloop
movdqu %xmm0,(%rdi)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
vmovdqu (%rdi),%xmm0
vmovdqu 0x10(%rdi),%xmm1
vmovdqu 0x20(%rdi),%xmm4
vmovq %rcx,%xmm5
vmovdqa IV(%rip),%xmm14
vmovdqa IV+16(%rip),%xmm15
jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
vmovdqa %xmm0,%xmm10
vmovdqa %xmm1,%xmm11
vpaddq %xmm5,%xmm4,%xmm4
vmovdqa %xmm14,%xmm2
vpxor %xmm15,%xmm4,%xmm3
vmovdqu (%rsi),%ymm6
vmovdqu 0x20(%rsi),%ymm7
addq $0x40,%rsi
leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
addq $0x40,%rax
vmovdqa -0x40(%rax),%ymm8
vmovdqa -0x20(%rax),%ymm9
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
vmovdqa %ymm9,%ymm7
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
decq %rdx
jne .Lblake2s_compress_avx512_mainloop
vmovdqu %xmm0,(%rdi)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */

View File

@@ -0,0 +1,232 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
void blake2s_compress_arch(struct blake2s_state *state,
const u8 *block, size_t nblocks,
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
do {
const size_t blocks = min_t(size_t, nblocks,
SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&blake2s_use_avx512))
blake2s_compress_avx512(state, block, blocks, inc);
else
blake2s_compress_ssse3(state, block, blocks, inc);
kernel_fpu_end();
nblocks -= blocks;
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
memcpy(tctx->key, key, keylen);
tctx->keylen = keylen;
return 0;
}
static int crypto_blake2s_init(struct shash_desc *desc)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct blake2s_state *state = shash_desc_ctx(desc);
const int outlen = crypto_shash_digestsize(desc->tfm);
if (tctx->keylen)
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
else
blake2s_init(state, outlen);
return 0;
}
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
unsigned int inlen)
{
struct blake2s_state *state = shash_desc_ctx(desc);
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
if (unlikely(!inlen))
return 0;
if (inlen > fill) {
memcpy(state->buf + state->buflen, in, fill);
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
state->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
/* Hash one less (full) block than strictly possible */
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
memcpy(state->buf + state->buflen, in, inlen);
state->buflen += inlen;
return 0;
}
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
struct blake2s_state *state = shash_desc_ctx(desc);
blake2s_set_lastblock(state);
memset(state->buf + state->buflen, 0,
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
blake2s_compress_arch(state, state->buf, 1, state->buflen);
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
memcpy(out, state->h, state->outlen);
memzero_explicit(state, sizeof(*state));
return 0;
}
static struct shash_alg blake2s_algs[] = {{
.base.cra_name = "blake2s-128",
.base.cra_driver_name = "blake2s-128-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_128_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-160",
.base.cra_driver_name = "blake2s-160-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_160_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-224",
.base.cra_driver_name = "blake2s-224-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_224_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-256",
.base.cra_driver_name = "blake2s-256-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_256_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}};
static int __init blake2s_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return 0;
static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shashes(blake2s_algs,
ARRAY_SIZE(blake2s_algs)) : 0;
}
static void __exit blake2s_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,836 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
*
* Copyright (C) 2018 Martin Willi
*/
#include <linux/linkage.h>
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
.octa 0x00000000000000000000000000000003
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha_2block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 2 data blocks output, o
# %rdx: up to 2 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts two ChaCha blocks by loading the state
# matrix twice across four AVX registers. It performs matrix operations
# on four words in each matrix in parallel, but requires shuffling to
# rearrange the words after each round.
vzeroupper
# x0..3[0-2] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vmovdqa %ymm0,%ymm8
vmovdqa %ymm1,%ymm9
vmovdqa %ymm2,%ymm10
vmovdqa %ymm3,%ymm11
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
sub $2,%r8d
jnz .Ldoubleround
# o0 = i0 ^ (x0 + s0)
vpaddd %ymm8,%ymm0,%ymm7
cmp $0x10,%rcx
jl .Lxorpart2
vpxord 0x00(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x00(%rsi)
vextracti128 $1,%ymm7,%xmm0
# o1 = i1 ^ (x1 + s1)
vpaddd %ymm9,%ymm1,%ymm7
cmp $0x20,%rcx
jl .Lxorpart2
vpxord 0x10(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x10(%rsi)
vextracti128 $1,%ymm7,%xmm1
# o2 = i2 ^ (x2 + s2)
vpaddd %ymm10,%ymm2,%ymm7
cmp $0x30,%rcx
jl .Lxorpart2
vpxord 0x20(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x20(%rsi)
vextracti128 $1,%ymm7,%xmm2
# o3 = i3 ^ (x3 + s3)
vpaddd %ymm11,%ymm3,%ymm7
cmp $0x40,%rcx
jl .Lxorpart2
vpxord 0x30(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x30(%rsi)
vextracti128 $1,%ymm7,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm7
cmp $0x50,%rcx
jl .Lxorpart2
vpxord 0x40(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x40(%rsi)
vmovdqa %xmm1,%xmm7
cmp $0x60,%rcx
jl .Lxorpart2
vpxord 0x50(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x50(%rsi)
vmovdqa %xmm2,%xmm7
cmp $0x70,%rcx
jl .Lxorpart2
vpxord 0x60(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x60(%rsi)
vmovdqa %xmm3,%xmm7
cmp $0x80,%rcx
jl .Lxorpart2
vpxord 0x70(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x70(%rsi)
.Ldone2:
vzeroupper
ret
.Lxorpart2:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm7,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone2
ENDPROC(chacha_2block_xor_avx512vl)
ENTRY(chacha_4block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four ChaCha blocks by loading the state
# matrix four times across eight AVX registers. It performs matrix
# operations on four words in two matrices in parallel, sequentially
# to the operations on the four words of the other two matrices. The
# required word shuffling has a rather high latency, we can do the
# arithmetic on two matrix-pairs without much slowdown.
vzeroupper
# x0..3[0-4] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vmovdqa %ymm0,%ymm4
vmovdqa %ymm1,%ymm5
vmovdqa %ymm2,%ymm6
vmovdqa %ymm3,%ymm7
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vpaddd CTR4BL(%rip),%ymm7,%ymm7
vmovdqa %ymm0,%ymm11
vmovdqa %ymm1,%ymm12
vmovdqa %ymm2,%ymm13
vmovdqa %ymm3,%ymm14
vmovdqa %ymm7,%ymm15
.Ldoubleround4:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
vpshufd $0x39,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
vpshufd $0x93,%ymm7,%ymm7
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
vpshufd $0x93,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
vpshufd $0x39,%ymm7,%ymm7
sub $2,%r8d
jnz .Ldoubleround4
# o0 = i0 ^ (x0 + s0), first block
vpaddd %ymm11,%ymm0,%ymm10
cmp $0x10,%rcx
jl .Lxorpart4
vpxord 0x00(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x00(%rsi)
vextracti128 $1,%ymm10,%xmm0
# o1 = i1 ^ (x1 + s1), first block
vpaddd %ymm12,%ymm1,%ymm10
cmp $0x20,%rcx
jl .Lxorpart4
vpxord 0x10(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x10(%rsi)
vextracti128 $1,%ymm10,%xmm1
# o2 = i2 ^ (x2 + s2), first block
vpaddd %ymm13,%ymm2,%ymm10
cmp $0x30,%rcx
jl .Lxorpart4
vpxord 0x20(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x20(%rsi)
vextracti128 $1,%ymm10,%xmm2
# o3 = i3 ^ (x3 + s3), first block
vpaddd %ymm14,%ymm3,%ymm10
cmp $0x40,%rcx
jl .Lxorpart4
vpxord 0x30(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x30(%rsi)
vextracti128 $1,%ymm10,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm10
cmp $0x50,%rcx
jl .Lxorpart4
vpxord 0x40(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x40(%rsi)
vmovdqa %xmm1,%xmm10
cmp $0x60,%rcx
jl .Lxorpart4
vpxord 0x50(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x50(%rsi)
vmovdqa %xmm2,%xmm10
cmp $0x70,%rcx
jl .Lxorpart4
vpxord 0x60(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x60(%rsi)
vmovdqa %xmm3,%xmm10
cmp $0x80,%rcx
jl .Lxorpart4
vpxord 0x70(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x70(%rsi)
# o0 = i0 ^ (x0 + s0), third block
vpaddd %ymm11,%ymm4,%ymm10
cmp $0x90,%rcx
jl .Lxorpart4
vpxord 0x80(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x80(%rsi)
vextracti128 $1,%ymm10,%xmm4
# o1 = i1 ^ (x1 + s1), third block
vpaddd %ymm12,%ymm5,%ymm10
cmp $0xa0,%rcx
jl .Lxorpart4
vpxord 0x90(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x90(%rsi)
vextracti128 $1,%ymm10,%xmm5
# o2 = i2 ^ (x2 + s2), third block
vpaddd %ymm13,%ymm6,%ymm10
cmp $0xb0,%rcx
jl .Lxorpart4
vpxord 0xa0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xa0(%rsi)
vextracti128 $1,%ymm10,%xmm6
# o3 = i3 ^ (x3 + s3), third block
vpaddd %ymm15,%ymm7,%ymm10
cmp $0xc0,%rcx
jl .Lxorpart4
vpxord 0xb0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xb0(%rsi)
vextracti128 $1,%ymm10,%xmm7
# xor and write fourth block
vmovdqa %xmm4,%xmm10
cmp $0xd0,%rcx
jl .Lxorpart4
vpxord 0xc0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xc0(%rsi)
vmovdqa %xmm5,%xmm10
cmp $0xe0,%rcx
jl .Lxorpart4
vpxord 0xd0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xd0(%rsi)
vmovdqa %xmm6,%xmm10
cmp $0xf0,%rcx
jl .Lxorpart4
vpxord 0xe0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xe0(%rsi)
vmovdqa %xmm7,%xmm10
cmp $0x100,%rcx
jl .Lxorpart4
vpxord 0xf0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xf0(%rsi)
.Ldone4:
vzeroupper
ret
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm10,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone4
ENDPROC(chacha_4block_xor_avx512vl)
ENTRY(chacha_8block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
# %rdx: up to 8 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts eight consecutive ChaCha blocks by loading
# the state matrix in AVX registers eight times. Compared to AVX2, this
# mostly benefits from the new rotate instructions in VL and the
# additional registers.
vzeroupper
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x12 += counter values 0-3
vpaddd CTR8BL(%rip),%ymm12,%ymm12
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm1,%ymm17
vmovdqa64 %ymm2,%ymm18
vmovdqa64 %ymm3,%ymm19
vmovdqa64 %ymm4,%ymm20
vmovdqa64 %ymm5,%ymm21
vmovdqa64 %ymm6,%ymm22
vmovdqa64 %ymm7,%ymm23
vmovdqa64 %ymm8,%ymm24
vmovdqa64 %ymm9,%ymm25
vmovdqa64 %ymm10,%ymm26
vmovdqa64 %ymm11,%ymm27
vmovdqa64 %ymm12,%ymm28
vmovdqa64 %ymm13,%ymm29
vmovdqa64 %ymm14,%ymm30
vmovdqa64 %ymm15,%ymm31
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
sub $2,%r8d
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpaddd %ymm16,%ymm0,%ymm0
vpaddd %ymm17,%ymm1,%ymm1
vpaddd %ymm18,%ymm2,%ymm2
vpaddd %ymm19,%ymm3,%ymm3
vpaddd %ymm20,%ymm4,%ymm4
vpaddd %ymm21,%ymm5,%ymm5
vpaddd %ymm22,%ymm6,%ymm6
vpaddd %ymm23,%ymm7,%ymm7
vpaddd %ymm24,%ymm8,%ymm8
vpaddd %ymm25,%ymm9,%ymm9
vpaddd %ymm26,%ymm10,%ymm10
vpaddd %ymm27,%ymm11,%ymm11
vpaddd %ymm28,%ymm12,%ymm12
vpaddd %ymm29,%ymm13,%ymm13
vpaddd %ymm30,%ymm14,%ymm14
vpaddd %ymm31,%ymm15,%ymm15
# interleave 32-bit words in state n, n+1
vpunpckldq %ymm1,%ymm0,%ymm16
vpunpckhdq %ymm1,%ymm0,%ymm17
vpunpckldq %ymm3,%ymm2,%ymm18
vpunpckhdq %ymm3,%ymm2,%ymm19
vpunpckldq %ymm5,%ymm4,%ymm20
vpunpckhdq %ymm5,%ymm4,%ymm21
vpunpckldq %ymm7,%ymm6,%ymm22
vpunpckhdq %ymm7,%ymm6,%ymm23
vpunpckldq %ymm9,%ymm8,%ymm24
vpunpckhdq %ymm9,%ymm8,%ymm25
vpunpckldq %ymm11,%ymm10,%ymm26
vpunpckhdq %ymm11,%ymm10,%ymm27
vpunpckldq %ymm13,%ymm12,%ymm28
vpunpckhdq %ymm13,%ymm12,%ymm29
vpunpckldq %ymm15,%ymm14,%ymm30
vpunpckhdq %ymm15,%ymm14,%ymm31
# interleave 64-bit words in state n, n+2
vpunpcklqdq %ymm18,%ymm16,%ymm0
vpunpcklqdq %ymm19,%ymm17,%ymm1
vpunpckhqdq %ymm18,%ymm16,%ymm2
vpunpckhqdq %ymm19,%ymm17,%ymm3
vpunpcklqdq %ymm22,%ymm20,%ymm4
vpunpcklqdq %ymm23,%ymm21,%ymm5
vpunpckhqdq %ymm22,%ymm20,%ymm6
vpunpckhqdq %ymm23,%ymm21,%ymm7
vpunpcklqdq %ymm26,%ymm24,%ymm8
vpunpcklqdq %ymm27,%ymm25,%ymm9
vpunpckhqdq %ymm26,%ymm24,%ymm10
vpunpckhqdq %ymm27,%ymm25,%ymm11
vpunpcklqdq %ymm30,%ymm28,%ymm12
vpunpcklqdq %ymm31,%ymm29,%ymm13
vpunpckhqdq %ymm30,%ymm28,%ymm14
vpunpckhqdq %ymm31,%ymm29,%ymm15
# interleave 128-bit words in state n, n+4
# xor/write first four blocks
vmovdqa64 %ymm0,%ymm16
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
cmp $0x0020,%rcx
jl .Lxorpart8
vpxord 0x0000(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0000(%rsi)
vmovdqa64 %ymm16,%ymm0
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
cmp $0x0040,%rcx
jl .Lxorpart8
vpxord 0x0020(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
cmp $0x0060,%rcx
jl .Lxorpart8
vpxord 0x0040(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0040(%rsi)
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
cmp $0x0080,%rcx
jl .Lxorpart8
vpxord 0x0060(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
cmp $0x00a0,%rcx
jl .Lxorpart8
vpxord 0x0080(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0080(%rsi)
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
cmp $0x00c0,%rcx
jl .Lxorpart8
vpxord 0x00a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00a0(%rsi)
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
cmp $0x00e0,%rcx
jl .Lxorpart8
vpxord 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00c0(%rsi)
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
cmp $0x0100,%rcx
jl .Lxorpart8
vpxord 0x00e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00e0(%rsi)
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
# xor remaining blocks, write to output
vmovdqa64 %ymm4,%ymm0
cmp $0x0120,%rcx
jl .Lxorpart8
vpxord 0x0100(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0100(%rsi)
vmovdqa64 %ymm12,%ymm0
cmp $0x0140,%rcx
jl .Lxorpart8
vpxord 0x0120(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0120(%rsi)
vmovdqa64 %ymm6,%ymm0
cmp $0x0160,%rcx
jl .Lxorpart8
vpxord 0x0140(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0140(%rsi)
vmovdqa64 %ymm14,%ymm0
cmp $0x0180,%rcx
jl .Lxorpart8
vpxord 0x0160(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0160(%rsi)
vmovdqa64 %ymm5,%ymm0
cmp $0x01a0,%rcx
jl .Lxorpart8
vpxord 0x0180(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0180(%rsi)
vmovdqa64 %ymm13,%ymm0
cmp $0x01c0,%rcx
jl .Lxorpart8
vpxord 0x01a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01a0(%rsi)
vmovdqa64 %ymm7,%ymm0
cmp $0x01e0,%rcx
jl .Lxorpart8
vpxord 0x01c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01c0(%rsi)
vmovdqa64 %ymm15,%ymm0
cmp $0x0200,%rcx
jl .Lxorpart8
vpxord 0x01e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01e0(%rsi)
.Ldone8:
vzeroupper
ret
.Lxorpart8:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0x1f,%rcx
jz .Ldone8
mov %rax,%r9
and $~0x1f,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
vpxord %ymm0,%ymm1,%ymm1
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
jmp .Ldone8
ENDPROC(chacha_8block_xor_avx512vl)

View File

@@ -1,5 +1,5 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
@@ -10,6 +10,7 @@
*/
#include <linux/linkage.h>
#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
.text
ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 1 data block output, o
# %rdx: 1 data block input, i
# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
# parallel, but requireds shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
* function performs matrix operations on four words in parallel, but requires
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
* rotation uses traditional shift+OR.
*
* The round count is given in %r8d.
*
* Clobbers: %r8d, %xmm4-%xmm7
*/
chacha_permute:
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
mov $10,%ecx
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3
dec %ecx
sub $2,%r8d
jnz .Ldoubleround
ret
ENDPROC(chacha_permute)
ENTRY(chacha_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 1 data block output, o
# %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
FRAME_BEGIN
# x0..3 = s0..3
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
mov %rcx,%rax
call chacha_permute
# o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
pxor %xmm5,%xmm1
movdqu %xmm1,0x10(%rsi)
movdqa %xmm1,%xmm0
cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
pxor %xmm6,%xmm2
movdqu %xmm2,0x20(%rsi)
movdqa %xmm2,%xmm0
cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
pxor %xmm7,%xmm3
movdqu %xmm3,0x30(%rsi)
movdqa %xmm3,%xmm0
cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)
.Ldone:
FRAME_END
ret
ENDPROC(chacha20_block_xor_ssse3)
ENTRY(chacha20_4block_xor_ssse3)
.Lxorpart:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone
and $~0x0f,%rax
mov %rsi,%r11
lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
lea -8(%r10),%rsp
jmp .Ldone
ENDPROC(chacha_block_xor_ssse3)
ENTRY(hchacha_block_ssse3)
# %rdi: Input state matrix, s
# %rsi: 4 data blocks output, o
# %rdx: 4 data blocks input, i
# %rsi: output (8 32-bit words)
# %edx: nrounds
FRAME_BEGIN
# This function encrypts four consecutive ChaCha20 blocks by loading the
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)
FRAME_END
ret
ENDPROC(hchacha_block_ssse3)
ENTRY(chacha_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four consecutive ChaCha blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
mov %rcx,%rax
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
# x12 += counter values 0-3
paddd %xmm1,%xmm12
mov $10,%ecx
.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
psrld $25,%xmm4
por %xmm0,%xmm4
dec %ecx
sub $2,%r8d
jnz .Ldoubleround4
# x0[0-3] += s0[0]
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
cmp $0x10,%rax
jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqa 0x10(%rsp),%xmm0
movdqu 0x80(%rdx),%xmm1
movdqu %xmm4,%xmm0
cmp $0x20,%rax
jl .Lxorpart4
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm0,0x10(%rsi)
movdqu %xmm8,%xmm0
cmp $0x30,%rax
jl .Lxorpart4
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x20(%rsi)
movdqu %xmm12,%xmm0
cmp $0x40,%rax
jl .Lxorpart4
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x30(%rsi)
movdqa 0x20(%rsp),%xmm0
cmp $0x50,%rax
jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
movdqu %xmm6,%xmm0
cmp $0x60,%rax
jl .Lxorpart4
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x50(%rsi)
movdqu %xmm10,%xmm0
cmp $0x70,%rax
jl .Lxorpart4
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x60(%rsi)
movdqu %xmm14,%xmm0
cmp $0x80,%rax
jl .Lxorpart4
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x70(%rsi)
movdqa 0x10(%rsp),%xmm0
cmp $0x90,%rax
jl .Lxorpart4
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm5,%xmm0
cmp $0xa0,%rax
jl .Lxorpart4
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x90(%rsi)
movdqu %xmm9,%xmm0
cmp $0xb0,%rax
jl .Lxorpart4
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xa0(%rsi)
movdqu %xmm13,%xmm0
cmp $0xc0,%rax
jl .Lxorpart4
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xb0(%rsi)
movdqa 0x30(%rsp),%xmm0
cmp $0xd0,%rax
jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm4
movdqu %xmm4,0x10(%rsi)
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm5
movdqu %xmm5,0x90(%rsi)
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm6
movdqu %xmm6,0x50(%rsi)
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm7
movdqu %xmm7,0xd0(%rsi)
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm8
movdqu %xmm8,0x20(%rsi)
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm9
movdqu %xmm9,0xa0(%rsi)
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm10
movdqu %xmm10,0x60(%rsi)
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm11
movdqu %xmm11,0xe0(%rsi)
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm12
movdqu %xmm12,0x30(%rsi)
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm13
movdqu %xmm13,0xb0(%rsi)
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm14
movdqu %xmm14,0x70(%rsi)
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)
movdqu %xmm7,%xmm0
cmp $0xe0,%rax
jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xd0(%rsi)
movdqu %xmm11,%xmm0
cmp $0xf0,%rax
jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xe0(%rsi)
movdqu %xmm15,%xmm0
cmp $0x100,%rax
jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xf0(%rsi)
.Ldone4:
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax
mov %rsi,%r11
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
jmp .Ldone4
ENDPROC(chacha_4block_xor_ssse3)

View File

@@ -1,448 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: 8 data blocks output, o
# %rdx: 8 data blocks input, i
# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
# scratch registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For final XORing step
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
# words, which allows us to do XOR in AVX registers. 8/16-bit word
# rotation is done with the slightly better performing byte shuffling,
# 7/12-bit word rotation uses traditional shift+OR.
vzeroupper
# 4 * 32 byte stack, 32-byte aligned
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x0..3 on stack
vmovdqa %ymm0,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm3,0x60(%rsp)
vmovdqa CTRINC(%rip),%ymm1
vmovdqa ROT8(%rip),%ymm2
vmovdqa ROT16(%rip),%ymm3
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
mov $10,%ecx
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
dec %ecx
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpaddd 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpbroadcastd 0x04(%rdi),%ymm0
vpaddd 0x20(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpbroadcastd 0x08(%rdi),%ymm0
vpaddd 0x40(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpbroadcastd 0x0c(%rdi),%ymm0
vpaddd 0x60(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpbroadcastd 0x10(%rdi),%ymm0
vpaddd %ymm0,%ymm4,%ymm4
vpbroadcastd 0x14(%rdi),%ymm0
vpaddd %ymm0,%ymm5,%ymm5
vpbroadcastd 0x18(%rdi),%ymm0
vpaddd %ymm0,%ymm6,%ymm6
vpbroadcastd 0x1c(%rdi),%ymm0
vpaddd %ymm0,%ymm7,%ymm7
vpbroadcastd 0x20(%rdi),%ymm0
vpaddd %ymm0,%ymm8,%ymm8
vpbroadcastd 0x24(%rdi),%ymm0
vpaddd %ymm0,%ymm9,%ymm9
vpbroadcastd 0x28(%rdi),%ymm0
vpaddd %ymm0,%ymm10,%ymm10
vpbroadcastd 0x2c(%rdi),%ymm0
vpaddd %ymm0,%ymm11,%ymm11
vpbroadcastd 0x30(%rdi),%ymm0
vpaddd %ymm0,%ymm12,%ymm12
vpbroadcastd 0x34(%rdi),%ymm0
vpaddd %ymm0,%ymm13,%ymm13
vpbroadcastd 0x38(%rdi),%ymm0
vpaddd %ymm0,%ymm14,%ymm14
vpbroadcastd 0x3c(%rdi),%ymm0
vpaddd %ymm0,%ymm15,%ymm15
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
# interleave 32-bit words in state n, n+1
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x20(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm1,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpckldq %ymm5,%ymm0,%ymm4
vpunpckhdq %ymm5,%ymm0,%ymm5
vmovdqa %ymm6,%ymm0
vpunpckldq %ymm7,%ymm0,%ymm6
vpunpckhdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpckldq %ymm9,%ymm0,%ymm8
vpunpckhdq %ymm9,%ymm0,%ymm9
vmovdqa %ymm10,%ymm0
vpunpckldq %ymm11,%ymm0,%ymm10
vpunpckhdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpckldq %ymm13,%ymm0,%ymm12
vpunpckhdq %ymm13,%ymm0,%ymm13
vmovdqa %ymm14,%ymm0
vpunpckldq %ymm15,%ymm0,%ymm14
vpunpckhdq %ymm15,%ymm0,%ymm15
# interleave 64-bit words in state n, n+2
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x40(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x00(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpcklqdq %ymm6,%ymm0,%ymm4
vpunpckhqdq %ymm6,%ymm0,%ymm6
vmovdqa %ymm5,%ymm0
vpunpcklqdq %ymm7,%ymm0,%ymm5
vpunpckhqdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpcklqdq %ymm10,%ymm0,%ymm8
vpunpckhqdq %ymm10,%ymm0,%ymm10
vmovdqa %ymm9,%ymm0
vpunpcklqdq %ymm11,%ymm0,%ymm9
vpunpckhqdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpcklqdq %ymm14,%ymm0,%ymm12
vpunpckhqdq %ymm14,%ymm0,%ymm14
vmovdqa %ymm13,%ymm0
vpunpcklqdq %ymm15,%ymm0,%ymm13
vpunpckhqdq %ymm15,%ymm0,%ymm15
# interleave 128-bit words in state n, n+4
vmovdqa 0x00(%rsp),%ymm0
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vmovdqa %ymm1,0x00(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
vmovdqa %ymm1,0x40(%rsp)
vmovdqa 0x60(%rsp),%ymm0
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
vmovdqa %ymm1,0x60(%rsp)
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vmovdqa %ymm0,%ymm8
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vmovdqa %ymm0,%ymm9
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vmovdqa %ymm0,%ymm10
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
vmovdqa %ymm0,%ymm11
# xor with corresponding input, write to output
vmovdqa 0x00(%rsp),%ymm0
vpxor 0x0000(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0000(%rsi)
vmovdqa 0x20(%rsp),%ymm0
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
vmovdqa 0x40(%rsp),%ymm0
vpxor 0x0040(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0040(%rsi)
vmovdqa 0x60(%rsp),%ymm0
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
vpxor 0x0100(%rdx),%ymm4,%ymm4
vmovdqu %ymm4,0x0100(%rsi)
vpxor 0x0180(%rdx),%ymm5,%ymm5
vmovdqu %ymm5,0x00180(%rsi)
vpxor 0x0140(%rdx),%ymm6,%ymm6
vmovdqu %ymm6,0x0140(%rsi)
vpxor 0x01c0(%rdx),%ymm7,%ymm7
vmovdqu %ymm7,0x01c0(%rsi)
vpxor 0x0020(%rdx),%ymm8,%ymm8
vmovdqu %ymm8,0x0020(%rsi)
vpxor 0x00a0(%rdx),%ymm9,%ymm9
vmovdqu %ymm9,0x00a0(%rsi)
vpxor 0x0060(%rdx),%ymm10,%ymm10
vmovdqu %ymm10,0x0060(%rsi)
vpxor 0x00e0(%rdx),%ymm11,%ymm11
vmovdqu %ymm11,0x00e0(%rsi)
vpxor 0x0120(%rdx),%ymm12,%ymm12
vmovdqu %ymm12,0x0120(%rsi)
vpxor 0x01a0(%rdx),%ymm13,%ymm13
vmovdqu %ymm13,0x01a0(%rsi)
vpxor 0x0160(%rdx),%ymm14,%ymm14
vmovdqu %ymm14,0x0160(%rsi)
vpxor 0x01e0(%rdx),%ymm15,%ymm15
vmovdqu %ymm15,0x01e0(%rsi)
vzeroupper
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_8block_xor_avx2)

View File

@@ -1,146 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
#define CHACHA20_STATE_ALIGN 16
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
static bool chacha20_use_avx2;
#endif
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
}
#endif
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha20_4block_xor_ssse3(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_ssse3(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_ssse3(state, buf, buf);
memcpy(dst, buf, bytes);
}
}
static int chacha20_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 *state, state_buf[16 + 2] __aligned(8);
struct skcipher_walk walk;
int err;
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
crypto_chacha_init(state, ctx, walk.iv);
kernel_fpu_begin();
while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
err = skcipher_walk_done(&walk,
walk.nbytes % CHACHA_BLOCK_SIZE);
}
if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_fpu_end();
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_simd,
.decrypt = chacha20_simd,
};
static int __init chacha20_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#endif
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");

View File

@@ -0,0 +1,322 @@
/*
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&chacha_use_avx512vl)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_2block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
if (IS_ENABLED(CONFIG_AS_AVX2) &&
static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12]++;
}
}
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd()) {
hchacha_block_generic(state, stream, nrounds);
} else {
kernel_fpu_begin();
hchacha_block_ssse3(state, stream, nrounds);
kernel_fpu_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_fpu_begin();
chacha_dosimd(state, dst, src, todo, nrounds);
kernel_fpu_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
if (!static_branch_likely(&chacha_use_simd) ||
!may_use_simd()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
} else {
kernel_fpu_begin();
chacha_dosimd(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
kernel_fpu_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_simd_stream_xor(req, ctx, req->iv);
}
static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
kernel_fpu_begin();
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
kernel_fpu_end();
} else {
hchacha_block_generic(state, subctx.key, ctx->nrounds);
}
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_simd_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_simd,
.decrypt = chacha_simd,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
},
};
static int __init chacha_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return 0;
static_branch_enable(&chacha_use_simd);
if (IS_ENABLED(CONFIG_AS_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
static_branch_enable(&chacha_use_avx2);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
static_branch_enable(&chacha_use_avx512vl);
}
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");

File diff suppressed because it is too large Load Diff

View File

@@ -1,394 +0,0 @@
/*
* Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.section .rodata.cst32.ANMASK, "aM", @progbits, 32
.align 32
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
.octa 0x0000000003ffffff0000000003ffffff
.section .rodata.cst32.ORMASK, "aM", @progbits, 32
.align 32
ORMASK: .octa 0x00000000010000000000000001000000
.octa 0x00000000010000000000000001000000
.text
#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define w0 0x14(%r8)
#define w1 0x18(%r8)
#define w2 0x1c(%r8)
#define w3 0x20(%r8)
#define w4 0x24(%r8)
#define y0 0x28(%r8)
#define y1 0x2c(%r8)
#define y2 0x30(%r8)
#define y3 0x34(%r8)
#define y4 0x38(%r8)
#define m %rsi
#define hc0 %ymm0
#define hc1 %ymm1
#define hc2 %ymm2
#define hc3 %ymm3
#define hc4 %ymm4
#define hc0x %xmm0
#define hc1x %xmm1
#define hc2x %xmm2
#define hc3x %xmm3
#define hc4x %xmm4
#define t1 %ymm5
#define t2 %ymm6
#define t1x %xmm5
#define t2x %xmm6
#define ruwy0 %ymm7
#define ruwy1 %ymm8
#define ruwy2 %ymm9
#define ruwy3 %ymm10
#define ruwy4 %ymm11
#define ruwy0x %xmm7
#define ruwy1x %xmm8
#define ruwy2x %xmm9
#define ruwy3x %xmm10
#define ruwy4x %xmm11
#define svxz1 %ymm12
#define svxz2 %ymm13
#define svxz3 %ymm14
#define svxz4 %ymm15
#define d0 %r9
#define d1 %r10
#define d2 %r11
#define d3 %r12
#define d4 %r13
ENTRY(poly1305_4block_avx2)
# %rdi: Accumulator h[5]
# %rsi: 64 byte input block m
# %rdx: Poly1305 key r[5]
# %rcx: Quadblock count
# %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
# This four-block variant uses loop unrolled block processing. It
# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
# h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
vzeroupper
push %rbx
push %r12
push %r13
# combine r0,u0,w0,y0
vmovd y0,ruwy0x
vmovd w0,t1x
vpunpcklqdq t1,ruwy0,ruwy0
vmovd u0,t1x
vmovd r0,t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,ruwy0,ruwy0
# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
vmovd y1,ruwy1x
vmovd w1,t1x
vpunpcklqdq t1,ruwy1,ruwy1
vmovd u1,t1x
vmovd r1,t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,ruwy1,ruwy1
vpslld $2,ruwy1,svxz1
vpaddd ruwy1,svxz1,svxz1
# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
vmovd y2,ruwy2x
vmovd w2,t1x
vpunpcklqdq t1,ruwy2,ruwy2
vmovd u2,t1x
vmovd r2,t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,ruwy2,ruwy2
vpslld $2,ruwy2,svxz2
vpaddd ruwy2,svxz2,svxz2
# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
vmovd y3,ruwy3x
vmovd w3,t1x
vpunpcklqdq t1,ruwy3,ruwy3
vmovd u3,t1x
vmovd r3,t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,ruwy3,ruwy3
vpslld $2,ruwy3,svxz3
vpaddd ruwy3,svxz3,svxz3
# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
vmovd y4,ruwy4x
vmovd w4,t1x
vpunpcklqdq t1,ruwy4,ruwy4
vmovd u4,t1x
vmovd r4,t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,ruwy4,ruwy4
vpslld $2,ruwy4,svxz4
vpaddd ruwy4,svxz4,svxz4
.Ldoblock4:
# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
# m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
vmovd 0x00(m),hc0x
vmovd 0x10(m),t1x
vpunpcklqdq t1,hc0,hc0
vmovd 0x20(m),t1x
vmovd 0x30(m),t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,hc0,hc0
vpand ANMASK(%rip),hc0,hc0
vmovd h0,t1x
vpaddd t1,hc0,hc0
# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
# (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
vmovd 0x03(m),hc1x
vmovd 0x13(m),t1x
vpunpcklqdq t1,hc1,hc1
vmovd 0x23(m),t1x
vmovd 0x33(m),t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,hc1,hc1
vpsrld $2,hc1,hc1
vpand ANMASK(%rip),hc1,hc1
vmovd h1,t1x
vpaddd t1,hc1,hc1
# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
# (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
vmovd 0x06(m),hc2x
vmovd 0x16(m),t1x
vpunpcklqdq t1,hc2,hc2
vmovd 0x26(m),t1x
vmovd 0x36(m),t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,hc2,hc2
vpsrld $4,hc2,hc2
vpand ANMASK(%rip),hc2,hc2
vmovd h2,t1x
vpaddd t1,hc2,hc2
# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
# (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
vmovd 0x09(m),hc3x
vmovd 0x19(m),t1x
vpunpcklqdq t1,hc3,hc3
vmovd 0x29(m),t1x
vmovd 0x39(m),t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,hc3,hc3
vpsrld $6,hc3,hc3
vpand ANMASK(%rip),hc3,hc3
vmovd h3,t1x
vpaddd t1,hc3,hc3
# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
# (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
vmovd 0x0c(m),hc4x
vmovd 0x1c(m),t1x
vpunpcklqdq t1,hc4,hc4
vmovd 0x2c(m),t1x
vmovd 0x3c(m),t2x
vpunpcklqdq t2,t1,t1
vperm2i128 $0x20,t1,hc4,hc4
vpsrld $8,hc4,hc4
vpor ORMASK(%rip),hc4,hc4
vmovd h4,t1x
vpaddd t1,hc4,hc4
# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
vpmuludq hc0,ruwy0,t1
# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
vpmuludq hc1,svxz4,t2
vpaddq t2,t1,t1
# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
vpmuludq hc2,svxz3,t2
vpaddq t2,t1,t1
# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
vpmuludq hc3,svxz2,t2
vpaddq t2,t1,t1
# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
vpmuludq hc4,svxz1,t2
vpaddq t2,t1,t1
# d0 = t1[0] + t1[1] + t[2] + t[3]
vpermq $0xee,t1,t2
vpaddq t2,t1,t1
vpsrldq $8,t1,t2
vpaddq t2,t1,t1
vmovq t1x,d0
# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
vpmuludq hc0,ruwy1,t1
# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
vpmuludq hc1,ruwy0,t2
vpaddq t2,t1,t1
# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
vpmuludq hc2,svxz4,t2
vpaddq t2,t1,t1
# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
vpmuludq hc3,svxz3,t2
vpaddq t2,t1,t1
# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
vpmuludq hc4,svxz2,t2
vpaddq t2,t1,t1
# d1 = t1[0] + t1[1] + t1[3] + t1[4]
vpermq $0xee,t1,t2
vpaddq t2,t1,t1
vpsrldq $8,t1,t2
vpaddq t2,t1,t1
vmovq t1x,d1
# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
vpmuludq hc0,ruwy2,t1
# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
vpmuludq hc1,ruwy1,t2
vpaddq t2,t1,t1
# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
vpmuludq hc2,ruwy0,t2
vpaddq t2,t1,t1
# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
vpmuludq hc3,svxz4,t2
vpaddq t2,t1,t1
# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
vpmuludq hc4,svxz3,t2
vpaddq t2,t1,t1
# d2 = t1[0] + t1[1] + t1[2] + t1[3]
vpermq $0xee,t1,t2
vpaddq t2,t1,t1
vpsrldq $8,t1,t2
vpaddq t2,t1,t1
vmovq t1x,d2
# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
vpmuludq hc0,ruwy3,t1
# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
vpmuludq hc1,ruwy2,t2
vpaddq t2,t1,t1
# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
vpmuludq hc2,ruwy1,t2
vpaddq t2,t1,t1
# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
vpmuludq hc3,ruwy0,t2
vpaddq t2,t1,t1
# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
vpmuludq hc4,svxz4,t2
vpaddq t2,t1,t1
# d3 = t1[0] + t1[1] + t1[2] + t1[3]
vpermq $0xee,t1,t2
vpaddq t2,t1,t1
vpsrldq $8,t1,t2
vpaddq t2,t1,t1
vmovq t1x,d3
# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
vpmuludq hc0,ruwy4,t1
# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
vpmuludq hc1,ruwy3,t2
vpaddq t2,t1,t1
# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
vpmuludq hc2,ruwy2,t2
vpaddq t2,t1,t1
# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
vpmuludq hc3,ruwy1,t2
vpaddq t2,t1,t1
# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
vpmuludq hc4,ruwy0,t2
vpaddq t2,t1,t1
# d4 = t1[0] + t1[1] + t1[2] + t1[3]
vpermq $0xee,t1,t2
vpaddq t2,t1,t1
vpsrldq $8,t1,t2
vpaddq t2,t1,t1
vmovq t1x,d4
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
# integers. It's true in a single-block implementation, but not here.
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
add %rax,d1
# h0 = d0 & 0x3ffffff
mov d0,%rbx
and $0x3ffffff,%ebx
# d2 += d1 >> 26
mov d1,%rax
shr $26,%rax
add %rax,d2
# h1 = d1 & 0x3ffffff
mov d1,%rax
and $0x3ffffff,%eax
mov %eax,h1
# d3 += d2 >> 26
mov d2,%rax
shr $26,%rax
add %rax,d3
# h2 = d2 & 0x3ffffff
mov d2,%rax
and $0x3ffffff,%eax
mov %eax,h2
# d4 += d3 >> 26
mov d3,%rax
shr $26,%rax
add %rax,d4
# h3 = d3 & 0x3ffffff
mov d3,%rax
and $0x3ffffff,%eax
mov %eax,h3
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
lea (%rax,%rax,4),%rax
add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
mov %rbx,%rax
shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
mov %ebx,h0
add $0x40,m
dec %rcx
jnz .Ldoblock4
vzeroupper
pop %r13
pop %r12
pop %rbx
ret
ENDPROC(poly1305_4block_avx2)

View File

@@ -1,590 +0,0 @@
/*
* Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.section .rodata.cst16.ANMASK, "aM", @progbits, 16
.align 16
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
.section .rodata.cst16.ORMASK, "aM", @progbits, 16
.align 16
ORMASK: .octa 0x00000000010000000000000001000000
.text
#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12
ENTRY(poly1305_block_sse2)
# %rdi: Accumulator h[5]
# %rsi: 16 byte input block m
# %rdx: Poly1305 key r[5]
# %rcx: Block count
# This single block variant tries to improve performance by doing two
# multiplications in parallel using SSE instructions. There is quite
# some quardword packing involved, hence the speedup is marginal.
push %rbx
push %r12
sub $0x10,%rsp
# s1..s4 = r1..r4 * 5
mov r1,%eax
lea (%eax,%eax,4),%eax
mov %eax,s1
mov r2,%eax
lea (%eax,%eax,4),%eax
mov %eax,s2
mov r3,%eax
lea (%eax,%eax,4),%eax
mov %eax,s3
mov r4,%eax
lea (%eax,%eax,4),%eax
mov %eax,s4
movdqa ANMASK(%rip),mask
.Ldoblock:
# h01 = [0, h1, 0, h0]
# h23 = [0, h3, 0, h2]
# h44 = [0, h4, 0, h4]
movd h0,h01
movd h1,t1
movd h2,h23
movd h3,t2
movd h4,h44
punpcklqdq t1,h01
punpcklqdq t2,h23
punpcklqdq h44,h44
# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
movd 0x00(m),t1
movd 0x03(m),t2
psrld $2,t2
punpcklqdq t2,t1
pand mask,t1
paddd t1,h01
# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
movd 0x06(m),t1
movd 0x09(m),t2
psrld $4,t1
psrld $6,t2
punpcklqdq t2,t1
pand mask,t1
paddd t1,h23
# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
mov 0x0c(m),%eax
shr $8,%eax
or $0x01000000,%eax
movd %eax,t1
pshufd $0xc4,t1,t1
paddd t1,h44
# t1[0] = h0 * r0 + h2 * s3
# t1[1] = h1 * s4 + h3 * s2
movd r0,t1
movd s4,t2
punpcklqdq t2,t1
pmuludq h01,t1
movd s3,t2
movd s2,t3
punpcklqdq t3,t2
pmuludq h23,t2
paddq t2,t1
# t2[0] = h0 * r1 + h2 * s4
# t2[1] = h1 * r0 + h3 * s3
movd r1,t2
movd r0,t3
punpcklqdq t3,t2
pmuludq h01,t2
movd s4,t3
movd s3,t4
punpcklqdq t4,t3
pmuludq h23,t3
paddq t3,t2
# t3[0] = h4 * s1
# t3[1] = h4 * s2
movd s1,t3
movd s2,t4
punpcklqdq t4,t3
pmuludq h44,t3
# d0 = t1[0] + t1[1] + t3[0]
# d1 = t2[0] + t2[1] + t3[1]
movdqa t1,t4
punpcklqdq t2,t4
punpckhqdq t2,t1
paddq t4,t1
paddq t3,t1
movq t1,d0
psrldq $8,t1
movq t1,d1
# t1[0] = h0 * r2 + h2 * r0
# t1[1] = h1 * r1 + h3 * s4
movd r2,t1
movd r1,t2
punpcklqdq t2,t1
pmuludq h01,t1
movd r0,t2
movd s4,t3
punpcklqdq t3,t2
pmuludq h23,t2
paddq t2,t1
# t2[0] = h0 * r3 + h2 * r1
# t2[1] = h1 * r2 + h3 * r0
movd r3,t2
movd r2,t3
punpcklqdq t3,t2
pmuludq h01,t2
movd r1,t3
movd r0,t4
punpcklqdq t4,t3
pmuludq h23,t3
paddq t3,t2
# t3[0] = h4 * s3
# t3[1] = h4 * s4
movd s3,t3
movd s4,t4
punpcklqdq t4,t3
pmuludq h44,t3
# d2 = t1[0] + t1[1] + t3[0]
# d3 = t2[0] + t2[1] + t3[1]
movdqa t1,t4
punpcklqdq t2,t4
punpckhqdq t2,t1
paddq t4,t1
paddq t3,t1
movq t1,d2
psrldq $8,t1
movq t1,d3
# t1[0] = h0 * r4 + h2 * r2
# t1[1] = h1 * r3 + h3 * r1
movd r4,t1
movd r3,t2
punpcklqdq t2,t1
pmuludq h01,t1
movd r2,t2
movd r1,t3
punpcklqdq t3,t2
pmuludq h23,t2
paddq t2,t1
# t3[0] = h4 * r0
movd r0,t3
pmuludq h44,t3
# d4 = t1[0] + t1[1] + t3[0]
movdqa t1,t4
psrldq $8,t4
paddq t4,t1
paddq t3,t1
movq t1,d4
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
add %rax,d1
# h0 = d0 & 0x3ffffff
mov d0,%rbx
and $0x3ffffff,%ebx
# d2 += d1 >> 26
mov d1,%rax
shr $26,%rax
add %rax,d2
# h1 = d1 & 0x3ffffff
mov d1,%rax
and $0x3ffffff,%eax
mov %eax,h1
# d3 += d2 >> 26
mov d2,%rax
shr $26,%rax
add %rax,d3
# h2 = d2 & 0x3ffffff
mov d2,%rax
and $0x3ffffff,%eax
mov %eax,h2
# d4 += d3 >> 26
mov d3,%rax
shr $26,%rax
add %rax,d4
# h3 = d3 & 0x3ffffff
mov d3,%rax
and $0x3ffffff,%eax
mov %eax,h3
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
lea (%rax,%rax,4),%rax
add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
mov %rbx,%rax
shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
mov %ebx,h0
add $0x10,m
dec %rcx
jnz .Ldoblock
add $0x10,%rsp
pop %r12
pop %rbx
ret
ENDPROC(poly1305_block_sse2)
#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
#undef d0
#define d0 %r13
ENTRY(poly1305_2block_sse2)
# %rdi: Accumulator h[5]
# %rsi: 16 byte input block m
# %rdx: Poly1305 key r[5]
# %rcx: Doubleblock count
# %r8: Poly1305 derived key r^2 u[5]
# This two-block variant further improves performance by using loop
# unrolled block processing. This is more straight forward and does
# less byte shuffling, but requires a second Poly1305 key r^2:
# h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
push %rbx
push %r12
push %r13
# combine r0,u0
movd u0,ru0
movd r0,t1
punpcklqdq t1,ru0
# combine r1,u1 and s1=r1*5,v1=u1*5
movd u1,ru1
movd r1,t1
punpcklqdq t1,ru1
movdqa ru1,sv1
pslld $2,sv1
paddd ru1,sv1
# combine r2,u2 and s2=r2*5,v2=u2*5
movd u2,ru2
movd r2,t1
punpcklqdq t1,ru2
movdqa ru2,sv2
pslld $2,sv2
paddd ru2,sv2
# combine r3,u3 and s3=r3*5,v3=u3*5
movd u3,ru3
movd r3,t1
punpcklqdq t1,ru3
movdqa ru3,sv3
pslld $2,sv3
paddd ru3,sv3
# combine r4,u4 and s4=r4*5,v4=u4*5
movd u4,ru4
movd r4,t1
punpcklqdq t1,ru4
movdqa ru4,sv4
pslld $2,sv4
paddd ru4,sv4
.Ldoblock2:
# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
movd 0x00(m),hc0
movd 0x10(m),t1
punpcklqdq t1,hc0
pand ANMASK(%rip),hc0
movd h0,t1
paddd t1,hc0
# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
movd 0x03(m),hc1
movd 0x13(m),t1
punpcklqdq t1,hc1
psrld $2,hc1
pand ANMASK(%rip),hc1
movd h1,t1
paddd t1,hc1
# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
movd 0x06(m),hc2
movd 0x16(m),t1
punpcklqdq t1,hc2
psrld $4,hc2
pand ANMASK(%rip),hc2
movd h2,t1
paddd t1,hc2
# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
movd 0x09(m),hc3
movd 0x19(m),t1
punpcklqdq t1,hc3
psrld $6,hc3
pand ANMASK(%rip),hc3
movd h3,t1
paddd t1,hc3
# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
movd 0x0c(m),hc4
movd 0x1c(m),t1
punpcklqdq t1,hc4
psrld $8,hc4
por ORMASK(%rip),hc4
movd h4,t1
paddd t1,hc4
# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
movdqa ru0,t1
pmuludq hc0,t1
# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
movdqa sv4,t2
pmuludq hc1,t2
paddq t2,t1
# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
movdqa sv3,t2
pmuludq hc2,t2
paddq t2,t1
# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
movdqa sv2,t2
pmuludq hc3,t2
paddq t2,t1
# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
movdqa sv1,t2
pmuludq hc4,t2
paddq t2,t1
# d0 = t1[0] + t1[1]
movdqa t1,t2
psrldq $8,t2
paddq t2,t1
movq t1,d0
# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
movdqa ru1,t1
pmuludq hc0,t1
# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
movdqa ru0,t2
pmuludq hc1,t2
paddq t2,t1
# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
movdqa sv4,t2
pmuludq hc2,t2
paddq t2,t1
# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
movdqa sv3,t2
pmuludq hc3,t2
paddq t2,t1
# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
movdqa sv2,t2
pmuludq hc4,t2
paddq t2,t1
# d1 = t1[0] + t1[1]
movdqa t1,t2
psrldq $8,t2
paddq t2,t1
movq t1,d1
# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
movdqa ru2,t1
pmuludq hc0,t1
# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
movdqa ru1,t2
pmuludq hc1,t2
paddq t2,t1
# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
movdqa ru0,t2
pmuludq hc2,t2
paddq t2,t1
# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
movdqa sv4,t2
pmuludq hc3,t2
paddq t2,t1
# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
movdqa sv3,t2
pmuludq hc4,t2
paddq t2,t1
# d2 = t1[0] + t1[1]
movdqa t1,t2
psrldq $8,t2
paddq t2,t1
movq t1,d2
# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
movdqa ru3,t1
pmuludq hc0,t1
# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
movdqa ru2,t2
pmuludq hc1,t2
paddq t2,t1
# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
movdqa ru1,t2
pmuludq hc2,t2
paddq t2,t1
# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
movdqa ru0,t2
pmuludq hc3,t2
paddq t2,t1
# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
movdqa sv4,t2
pmuludq hc4,t2
paddq t2,t1
# d3 = t1[0] + t1[1]
movdqa t1,t2
psrldq $8,t2
paddq t2,t1
movq t1,d3
# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
movdqa ru4,t1
pmuludq hc0,t1
# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
movdqa ru3,t2
pmuludq hc1,t2
paddq t2,t1
# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
movdqa ru2,t2
pmuludq hc2,t2
paddq t2,t1
# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
movdqa ru1,t2
pmuludq hc3,t2
paddq t2,t1
# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
movdqa ru0,t2
pmuludq hc4,t2
paddq t2,t1
# d4 = t1[0] + t1[1]
movdqa t1,t2
psrldq $8,t2
paddq t2,t1
movq t1,d4
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
# integers. It's true in a single-block implementation, but not here.
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
add %rax,d1
# h0 = d0 & 0x3ffffff
mov d0,%rbx
and $0x3ffffff,%ebx
# d2 += d1 >> 26
mov d1,%rax
shr $26,%rax
add %rax,d2
# h1 = d1 & 0x3ffffff
mov d1,%rax
and $0x3ffffff,%eax
mov %eax,h1
# d3 += d2 >> 26
mov d2,%rax
shr $26,%rax
add %rax,d3
# h2 = d2 & 0x3ffffff
mov d2,%rax
and $0x3ffffff,%eax
mov %eax,h2
# d4 += d3 >> 26
mov d3,%rax
shr $26,%rax
add %rax,d4
# h3 = d3 & 0x3ffffff
mov d3,%rax
and $0x3ffffff,%eax
mov %eax,h3
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
lea (%rax,%rax,4),%rax
add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
mov %rbx,%rax
shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
mov %ebx,h0
add $0x20,m
dec %rcx
jnz .Ldoblock2
pop %r13
pop %r12
pop %rbx
ret
ENDPROC(poly1305_2block_sse2)

File diff suppressed because it is too large Load Diff

View File

@@ -1,135 +1,175 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Poly1305 authenticator algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/poly1305.h>
#include <crypto/internal/poly1305.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
#include <asm/intel-family.h>
struct poly1305_simd_desc_ctx {
struct poly1305_desc_ctx base;
/* derived key u set? */
bool uset;
#ifdef CONFIG_AS_AVX2
/* derived keys r^3, r^4 set? */
bool wset;
#endif
/* derived Poly1305 key r^2 */
u32 u[5];
/* ... silently appended r^3 and r^4 when using AVX2 */
asmlinkage void poly1305_init_x86_64(void *ctx,
const u8 key[POLY1305_KEY_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4]);
asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4]);
asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
const u32 padbit);
asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
const u32 padbit);
asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
struct poly1305_arch_internal {
union {
struct {
u32 h[5];
u32 is_base2_26;
};
u64 hs[3];
};
u64 r[2];
u64 pad;
struct { u32 r2, r1, r4, r3; } rn[9];
};
asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
const u32 *r, unsigned int blocks);
asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
unsigned int blocks, const u32 *u);
#ifdef CONFIG_AS_AVX2
asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
unsigned int blocks, const u32 *u);
static bool poly1305_use_avx2;
#endif
static int poly1305_simd_init(struct shash_desc *desc)
/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
* the unfortunate situation of using AVX and then having to go back to scalar
* -- because the user is silly and has called the update function from two
* separate contexts -- then we need to convert back to the original base before
* proceeding. It is possible to reason that the initial reduction below is
* sufficient given the implementation invariants. However, for an avoidance of
* doubt and because this is not performance critical, we do the full reduction
* anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
*/
static void convert_to_base2_64(void *ctx)
{
struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
struct poly1305_arch_internal *state = ctx;
u32 cy;
sctx->uset = false;
#ifdef CONFIG_AS_AVX2
sctx->wset = false;
#endif
if (!state->is_base2_26)
return;
return crypto_poly1305_init(desc);
cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
state->hs[2] = state->h[4] >> 24;
#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
state->hs[2] &= 3;
state->hs[0] += cy;
state->hs[1] += (cy = ULT(state->hs[0], cy));
state->hs[2] += ULT(state->hs[1], cy);
#undef ULT
state->is_base2_26 = 0;
}
static void poly1305_simd_mult(u32 *a, const u32 *b)
static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
{
u8 m[POLY1305_BLOCK_SIZE];
memset(m, 0, sizeof(m));
/* The poly1305 block function adds a hi-bit to the accumulator which
* we don't need for key multiplication; compensate for it. */
a[4] -= 1 << 24;
poly1305_block_sse2(a, m, b, 1);
poly1305_init_x86_64(ctx, key);
}
static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen)
static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
const u32 padbit)
{
struct poly1305_simd_desc_ctx *sctx;
unsigned int blocks, datalen;
struct poly1305_arch_internal *state = ctx;
BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
SZ_4K % POLY1305_BLOCK_SIZE);
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
!may_use_simd()) {
convert_to_base2_64(ctx);
poly1305_blocks_x86_64(ctx, inp, len, padbit);
return;
}
do {
const size_t bytes = min_t(size_t, len, SZ_4K);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
poly1305_blocks_avx512(ctx, inp, bytes, padbit);
else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
poly1305_blocks_avx2(ctx, inp, bytes, padbit);
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
kernel_fpu_end();
len -= bytes;
inp += bytes;
} while (len);
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4])
{
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
poly1305_emit_x86_64(ctx, mac, nonce);
else
poly1305_emit_avx(ctx, mac, nonce);
}
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_simd_init(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(&key[16]);
dctx->s[1] = get_unaligned_le32(&key[20]);
dctx->s[2] = get_unaligned_le32(&key[24]);
dctx->s[3] = get_unaligned_le32(&key[28]);
dctx->buflen = 0;
dctx->sset = true;
}
EXPORT_SYMBOL(poly1305_init_arch);
static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
const u8 *inp, unsigned int len)
{
unsigned int acc = 0;
if (unlikely(!dctx->sset)) {
datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
src += srclen - datalen;
srclen = datalen;
}
#ifdef CONFIG_AS_AVX2
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(!sctx->wset)) {
if (!sctx->uset) {
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
poly1305_simd_mult(sctx->u + 5, dctx->r.r);
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
poly1305_simd_mult(sctx->u + 10, dctx->r.r);
sctx->wset = true;
if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
poly1305_simd_init(&dctx->h, inp);
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
acc += POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
sctx->u);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
#endif
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
if (unlikely(!sctx->uset)) {
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(&inp[0]);
dctx->s[1] = get_unaligned_le32(&inp[4]);
dctx->s[2] = get_unaligned_le32(&inp[8]);
dctx->s[3] = get_unaligned_le32(&inp[12]);
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
acc += POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
sctx->u);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;
return acc;
}
static int poly1305_simd_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
/* kernel_fpu_begin/end is costly, use fallback for small updates */
if (srclen <= 288 || !may_use_simd())
return crypto_poly1305_update(desc, src, srclen);
kernel_fpu_begin();
unsigned int bytes, used;
if (unlikely(dctx->buflen)) {
bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
@@ -139,34 +179,76 @@ static int poly1305_simd_update(struct shash_desc *desc,
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_simd_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE);
if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
bytes = poly1305_simd_blocks(dctx, src, srclen);
src += srclen - bytes;
srclen = bytes;
bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
srclen -= bytes;
used = crypto_poly1305_setdctxkey(dctx, src, bytes);
if (likely(bytes - used))
poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
src += bytes;
}
kernel_fpu_end();
if (unlikely(srclen)) {
dctx->buflen = srclen;
memcpy(dctx->buf, src, srclen);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_simd_emit(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
*dctx = (struct poly1305_desc_ctx){};
return 0;
}
static int crypto_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
poly1305_update_arch(dctx, src, srclen);
return 0;
}
static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg alg = {
.digestsize = POLY1305_DIGEST_SIZE,
.init = poly1305_simd_init,
.update = poly1305_simd_update,
.init = crypto_poly1305_init,
.update = crypto_poly1305_update,
.final = crypto_poly1305_final,
.descsize = sizeof(struct poly1305_simd_desc_ctx),
.descsize = sizeof(struct poly1305_desc_ctx),
.base = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
@@ -178,30 +260,33 @@ static struct shash_alg alg = {
static int __init poly1305_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
if (poly1305_use_avx2)
alg.descsize += 10 * sizeof(u32);
#endif
return crypto_register_shash(&alg);
if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx);
if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx2);
if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
/* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
static_branch_enable(&poly1305_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
static void __exit poly1305_simd_mod_exit(void)
{
crypto_unregister_shash(&alg);
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
crypto_unregister_shash(&alg);
}
module_init(poly1305_simd_mod_init);
module_exit(poly1305_simd_mod_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
MODULE_DESCRIPTION("Poly1305 authenticator");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-simd");

View File

@@ -387,7 +387,7 @@ static __init int _init_events_attrs(void)
while (amd_iommu_v2_event_descs[i].attr.attr.name)
i++;
attrs = kcalloc(i + 1, sizeof(struct attribute **), GFP_KERNEL);
attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL);
if (!attrs)
return -ENOMEM;

View File

@@ -249,9 +249,9 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
char arg[32];
char arg[128];
char *argptr = arg;
int bit;
int arglen, res, bit;
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -271,12 +271,26 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
sizeof(arg)) &&
get_option(&argptr, &bit) &&
bit >= 0 &&
bit < NCAPINTS * 32)
setup_clear_cpu_cap(bit);
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
pr_info("Clearing CPUID bits:");
do {
res = get_option(&argptr, &bit);
if (res == 0 || res == 3)
break;
/* If the argument was too long, the last bit may be cut off */
if (res == 1 && arglen >= sizeof(arg))
break;
if (bit >= 0 && bit < NCAPINTS * 32) {
pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
setup_clear_cpu_cap(bit);
}
} while (res == 2);
pr_cont("\n");
}
/*

View File

@@ -104,7 +104,6 @@ fs_initcall(nmi_warning_debugfs);
static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
u64 whole_msecs = READ_ONCE(action->max_duration);
int remainder_ns, decimal_msecs;
if (duration < nmi_longest_ns || duration < action->max_duration)
@@ -112,12 +111,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
action->max_duration = duration;
remainder_ns = do_div(whole_msecs, (1000 * 1000));
remainder_ns = do_div(duration, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
printk_ratelimited(KERN_INFO
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
action->handler, whole_msecs, decimal_msecs);
action->handler, duration, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)

View File

@@ -3561,7 +3561,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
u64 tsc_aux = 0;
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
return emulate_gp(ctxt, 0);
return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}

View File

@@ -6225,6 +6225,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
cond_resched_lock(&kvm->mmu_lock);
}
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);

View File

@@ -5380,6 +5380,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);

View File

@@ -2129,11 +2129,10 @@ static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
printk(KERN_INFO "attempt to access beyond end of device\n");
printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
bio_devname(bio, b), bio->bi_opf,
(unsigned long long)bio_end_sector(bio),
(long long)maxsector);
pr_info_ratelimited("attempt to access beyond end of device\n"
"%s: rw=%d, want=%llu, limit=%llu\n",
bio_devname(bio, b), bio->bi_opf,
bio_end_sector(bio), maxsector);
}
#ifdef CONFIG_FAIL_MAKE_REQUEST

View File

@@ -3,10 +3,11 @@ ARCH=arm64
CLANG_TRIPLE=aarch64-linux-gnu-
CROSS_COMPILE=aarch64-linux-androidkernel-
CROSS_COMPILE_COMPAT=arm-linux-androidkernel-
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
LINUX_GCC_CROSS_COMPILE_COMPAT_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
LINUX_GCC_CROSS_COMPILE_COMPAT_PREBUILTS_BIN=prebuilts/gas/linux-x86
FILES="
arch/arm64/boot/Image
arch/arm64/boot/Image.gz
vmlinux
System.map

View File

@@ -2,7 +2,7 @@ ARCH=arm
CLANG_TRIPLE=arm-linux-gnueabi-
CROSS_COMPILE=arm-linux-androidkernel-
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
FILES="
arch/arm/boot/Image.gz

View File

@@ -3,7 +3,7 @@ KMI_GENERATION=0
LLVM=1
DEPMOD=depmod
CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r383902/bin
CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r399163b/bin
BUILDTOOLS_PREBUILT_BIN=build/build-tools/path/linux-x86
EXTRA_CMDS=''

View File

@@ -2,7 +2,7 @@ ARCH=x86_64
CLANG_TRIPLE=x86_64-linux-gnu-
CROSS_COMPILE=x86_64-linux-androidkernel-
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gas/linux-x86
FILES="
arch/x86/boot/bzImage

View File

@@ -163,7 +163,6 @@ config CRYPTO_USER
config CRYPTO_MANAGER_DISABLE_TESTS
bool "Disable run-time self tests"
default y
depends on CRYPTO_MANAGER2
help
Disable run-time self tests that normally take place at
algorithm registration.
@@ -257,6 +256,17 @@ config CRYPTO_GLUE_HELPER_X86
config CRYPTO_ENGINE
tristate
config CRYPTO_CURVE25519
tristate "Curve25519 algorithm"
select CRYPTO_KPP
select CRYPTO_LIB_CURVE25519_GENERIC
config CRYPTO_CURVE25519_X86
tristate "x86_64 accelerated Curve25519 scalar multiplication library"
depends on X86 && 64BIT
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
comment "Authenticated Encryption with Associated Data"
config CRYPTO_CCM
@@ -498,12 +508,12 @@ config CRYPTO_KEYWRAP
config CRYPTO_NHPOLY1305
tristate
select CRYPTO_HASH
select CRYPTO_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
config CRYPTO_ADIANTUM
tristate "Adiantum support"
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_NHPOLY1305
help
Adiantum is a tweakable, length-preserving encryption mode
@@ -638,6 +648,30 @@ config CRYPTO_CRC32_MIPS
instructions, when available.
config CRYPTO_BLAKE2S
tristate "BLAKE2s digest algorithm"
select CRYPTO_LIB_BLAKE2S_GENERIC
select CRYPTO_HASH
help
Implementation of cryptographic hash function BLAKE2s
optimized for 8-32bit platforms and can produce digests of any size
between 1 to 32. The keyed hash is also implemented.
This module provides the following algorithms:
- blake2s-128
- blake2s-160
- blake2s-224
- blake2s-256
See https://blake2.net for further information.
config CRYPTO_BLAKE2S_X86
tristate "BLAKE2s digest algorithm (x86 accelerated version)"
depends on X86 && 64BIT
select CRYPTO_LIB_BLAKE2S_GENERIC
select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH
@@ -684,6 +718,7 @@ config CRYPTO_GHASH
config CRYPTO_POLY1305
tristate "Poly1305 authenticator algorithm"
select CRYPTO_HASH
select CRYPTO_LIB_POLY1305_GENERIC
help
Poly1305 authenticator algorithm, RFC7539.
@@ -694,7 +729,8 @@ config CRYPTO_POLY1305
config CRYPTO_POLY1305_X86_64
tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
depends on X86 && 64BIT
select CRYPTO_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_ARCH_HAVE_LIB_POLY1305
help
Poly1305 authenticator algorithm, RFC7539.
@@ -703,6 +739,11 @@ config CRYPTO_POLY1305_X86_64
in IETF protocols. This is the x86_64 assembler implementation using SIMD
instructions.
config CRYPTO_POLY1305_MIPS
tristate "Poly1305 authenticator algorithm (MIPS optimized)"
depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_MD4
tristate "MD4 digest algorithm"
select CRYPTO_HASH
@@ -1467,6 +1508,7 @@ config CRYPTO_SALSA20
config CRYPTO_CHACHA20
tristate "ChaCha stream cipher algorithms"
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_BLKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
@@ -1487,19 +1529,20 @@ config CRYPTO_CHACHA20
in some performance-sensitive scenarios.
config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CHACHA
help
ChaCha20 cipher algorithm, RFC7539.
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
XChaCha20, and XChaCha12 stream ciphers.
ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
Bernstein and further specified in RFC7539 for use in IETF protocols.
This is the x86_64 assembler implementation using SIMD instructions.
See also:
<http://cr.yp.to/chacha/chacha-20080128.pdf>
config CRYPTO_CHACHA_MIPS
tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
depends on CPU_MIPS32_R2
select CRYPTO_BLKCIPHER
select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_SEED
tristate "SEED cipher algorithm"
@@ -1901,6 +1944,7 @@ config CRYPTO_USER_API_AEAD
config CRYPTO_HASH_INFO
bool
source "lib/crypto/Kconfig"
source "drivers/crypto/Kconfig"
source crypto/asymmetric_keys/Kconfig
source certs/Kconfig

View File

@@ -73,6 +73,7 @@ obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
obj-$(CONFIG_CRYPTO_ECB) += ecb.o
obj-$(CONFIG_CRYPTO_CBC) += cbc.o
@@ -149,6 +150,7 @@ ecdh_generic-y := ecc.o
ecdh_generic-y += ecdh.o
ecdh_generic-y += ecdh_helper.o
obj-$(CONFIG_CRYPTO_ECDH) += ecdh_generic.o
obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o
#
# generic algorithms and the async_tx api

View File

@@ -33,6 +33,7 @@
#include <crypto/b128ops.h>
#include <crypto/chacha.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <crypto/internal/skcipher.h>
#include <crypto/nhpoly1305.h>
#include <crypto/scatterwalk.h>
@@ -71,7 +72,7 @@ struct adiantum_tfm_ctx {
struct crypto_skcipher *streamcipher;
struct crypto_cipher *blockcipher;
struct crypto_shash *hash;
struct poly1305_key header_hash_key;
struct poly1305_core_key header_hash_key;
};
struct adiantum_request_ctx {
@@ -242,13 +243,13 @@ static void adiantum_hash_header(struct skcipher_request *req)
BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key,
&header, sizeof(header) / POLY1305_BLOCK_SIZE);
&header, sizeof(header) / POLY1305_BLOCK_SIZE, 1);
BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
TWEAK_SIZE / POLY1305_BLOCK_SIZE);
TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
poly1305_core_emit(&state, &rctx->header_hash);
poly1305_core_emit(&state, NULL, &rctx->header_hash);
}
/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */

View File

@@ -82,7 +82,7 @@ static int crypto_aead_copy_sgl(struct crypto_skcipher *null_tfm,
SKCIPHER_REQUEST_ON_STACK(skreq, null_tfm);
skcipher_request_set_tfm(skreq, null_tfm);
skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_BACKLOG,
skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_SLEEP,
NULL, NULL);
skcipher_request_set_crypt(skreq, src, dst, len, NULL);
@@ -295,19 +295,20 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
areq->outlen = outlen;
aead_request_set_callback(&areq->cra_u.aead_req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
CRYPTO_TFM_REQ_MAY_SLEEP,
af_alg_async_cb, areq);
err = ctx->enc ? crypto_aead_encrypt(&areq->cra_u.aead_req) :
crypto_aead_decrypt(&areq->cra_u.aead_req);
/* AIO operation in progress */
if (err == -EINPROGRESS || err == -EBUSY)
if (err == -EINPROGRESS)
return -EIOCBQUEUED;
sock_put(sk);
} else {
/* Synchronous operation */
aead_request_set_callback(&areq->cra_u.aead_req,
CRYPTO_TFM_REQ_MAY_SLEEP |
CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &ctx->wait);
err = crypto_wait_req(ctx->enc ?

View File

@@ -127,7 +127,7 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
/* AIO operation in progress */
if (err == -EINPROGRESS || err == -EBUSY)
if (err == -EINPROGRESS)
return -EIOCBQUEUED;
sock_put(sk);

170
crypto/blake2s_generic.c Normal file
View File

@@ -0,0 +1,170 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
memcpy(tctx->key, key, keylen);
tctx->keylen = keylen;
return 0;
}
static int crypto_blake2s_init(struct shash_desc *desc)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct blake2s_state *state = shash_desc_ctx(desc);
const int outlen = crypto_shash_digestsize(desc->tfm);
if (tctx->keylen)
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
else
blake2s_init(state, outlen);
return 0;
}
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
unsigned int inlen)
{
struct blake2s_state *state = shash_desc_ctx(desc);
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
if (unlikely(!inlen))
return 0;
if (inlen > fill) {
memcpy(state->buf + state->buflen, in, fill);
blake2s_compress_generic(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
state->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
/* Hash one less (full) block than strictly possible */
blake2s_compress_generic(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
memcpy(state->buf + state->buflen, in, inlen);
state->buflen += inlen;
return 0;
}
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
struct blake2s_state *state = shash_desc_ctx(desc);
blake2s_set_lastblock(state);
memset(state->buf + state->buflen, 0,
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
blake2s_compress_generic(state, state->buf, 1, state->buflen);
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
memcpy(out, state->h, state->outlen);
memzero_explicit(state, sizeof(*state));
return 0;
}
static struct shash_alg blake2s_algs[] = {{
.base.cra_name = "blake2s-128",
.base.cra_driver_name = "blake2s-128-generic",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_128_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-160",
.base.cra_driver_name = "blake2s-160-generic",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_160_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-224",
.base.cra_driver_name = "blake2s-224-generic",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_224_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-256",
.base.cra_driver_name = "blake2s-256-generic",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_256_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}};
static int __init blake2s_mod_init(void)
{
return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
static void __exit blake2s_mod_exit(void)
{
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
subsys_initcall(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-generic");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-generic");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-generic");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-generic");
MODULE_LICENSE("GPL v2");

View File

@@ -12,33 +12,12 @@
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
/* aligned to potentially speed up crypto_xor() */
u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
if (dst != src)
memcpy(dst, src, bytes);
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha_block(state, stream, nrounds);
crypto_xor(dst, stream, CHACHA_BLOCK_SIZE);
bytes -= CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
}
if (bytes) {
chacha_block(state, stream, nrounds);
crypto_xor(dst, stream, bytes);
}
}
static int chacha_stream_xor(struct skcipher_request *req,
struct chacha_ctx *ctx, u8 *iv)
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
@@ -46,7 +25,7 @@ static int chacha_stream_xor(struct skcipher_request *req,
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, iv);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -54,75 +33,23 @@ static int chacha_stream_xor(struct skcipher_request *req,
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv)
{
state[0] = 0x61707865; /* "expa" */
state[1] = 0x3320646e; /* "nd 3" */
state[2] = 0x79622d32; /* "2-by" */
state[3] = 0x6b206574; /* "te k" */
state[4] = ctx->key[0];
state[5] = ctx->key[1];
state[6] = ctx->key[2];
state[7] = ctx->key[3];
state[8] = ctx->key[4];
state[9] = ctx->key[5];
state[10] = ctx->key[6];
state[11] = ctx->key[7];
state[12] = get_unaligned_le32(iv + 0);
state[13] = get_unaligned_le32(iv + 4);
state[14] = get_unaligned_le32(iv + 8);
state[15] = get_unaligned_le32(iv + 12);
}
EXPORT_SYMBOL_GPL(crypto_chacha_init);
static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize, int nrounds)
{
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
int i;
if (keysize != CHACHA_KEY_SIZE)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
ctx->nrounds = nrounds;
return 0;
}
int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize)
{
return chacha_setkey(tfm, key, keysize, 20);
}
EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize)
{
return chacha_setkey(tfm, key, keysize, 12);
}
EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
int crypto_chacha_crypt(struct skcipher_request *req)
static int crypto_chacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_stream_xor(req, ctx, req->iv);
}
EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
int crypto_xchacha_crypt(struct skcipher_request *req)
static int crypto_xchacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -131,8 +58,8 @@ int crypto_xchacha_crypt(struct skcipher_request *req)
u8 real_iv[16];
/* Compute the subkey given the original key and first 128 nonce bits */
crypto_chacha_init(state, ctx, req->iv);
hchacha_block(state, subctx.key, ctx->nrounds);
chacha_init_generic(state, ctx->key, req->iv);
hchacha_block_generic(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
/* Build the real IV */
@@ -142,7 +69,6 @@ int crypto_xchacha_crypt(struct skcipher_request *req)
/* Generate the stream and XOR it with the data */
return chacha_stream_xor(req, &subctx, real_iv);
}
EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
static struct skcipher_alg algs[] = {
{
@@ -157,7 +83,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.setkey = chacha20_setkey,
.encrypt = crypto_chacha_crypt,
.decrypt = crypto_chacha_crypt,
}, {
@@ -172,7 +98,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.setkey = chacha20_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}, {
@@ -187,7 +113,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.setkey = chacha12_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}

View File

@@ -0,0 +1,90 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>
#include <crypto/kpp.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{
u8 *secret = kpp_tfm_ctx(tfm);
if (!len)
curve25519_generate_secret(secret);
else if (len == CURVE25519_KEY_SIZE &&
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
memcpy(secret, buf, CURVE25519_KEY_SIZE);
else
return -EINVAL;
return 0;
}
static int curve25519_compute_value(struct kpp_request *req)
{
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
const u8 *secret = kpp_tfm_ctx(tfm);
u8 public_key[CURVE25519_KEY_SIZE];
u8 buf[CURVE25519_KEY_SIZE];
int copied, nbytes;
u8 const *bp;
if (req->src) {
copied = sg_copy_to_buffer(req->src,
sg_nents_for_len(req->src,
CURVE25519_KEY_SIZE),
public_key, CURVE25519_KEY_SIZE);
if (copied != CURVE25519_KEY_SIZE)
return -EINVAL;
bp = public_key;
} else {
bp = curve25519_base_point;
}
curve25519_generic(buf, secret, bp);
/* might want less than we've got */
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
nbytes),
buf, nbytes);
if (copied != nbytes)
return -EINVAL;
return 0;
}
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
return CURVE25519_KEY_SIZE;
}
static struct kpp_alg curve25519_alg = {
.base.cra_name = "curve25519",
.base.cra_driver_name = "curve25519-generic",
.base.cra_priority = 100,
.base.cra_module = THIS_MODULE,
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
.set_secret = curve25519_set_secret,
.generate_public_key = curve25519_compute_value,
.compute_shared_secret = curve25519_compute_value,
.max_size = curve25519_max_size,
};
static int curve25519_init(void)
{
return crypto_register_kpp(&curve25519_alg);
}
static void curve25519_exit(void)
{
crypto_unregister_kpp(&curve25519_alg);
}
subsys_initcall(curve25519_init);
module_exit(curve25519_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-generic");
MODULE_LICENSE("GPL");

View File

@@ -33,6 +33,7 @@
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <crypto/nhpoly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
@@ -78,7 +79,7 @@ static void process_nh_hash_value(struct nhpoly1305_state *state,
BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1);
}
/*
@@ -209,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
if (state->nh_remaining)
process_nh_hash_value(state, key);
poly1305_core_emit(&state->poly_state, dst);
poly1305_core_emit(&state->poly_state, NULL, dst);
return 0;
}
EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);

View File

@@ -13,65 +13,33 @@
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/poly1305.h>
#include <crypto/internal/poly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/unaligned.h>
static inline u64 mlt(u64 a, u64 b)
{
return a * b;
}
static inline u32 sr(u64 v, u_char n)
{
return v >> n;
}
static inline u32 and(u32 v, u32 mask)
{
return v & mask;
}
int crypto_poly1305_init(struct shash_desc *desc)
static int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
poly1305_core_init(&dctx->h);
dctx->buflen = 0;
dctx->rset = false;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
EXPORT_SYMBOL_GPL(crypto_poly1305_init);
void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
{
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
}
EXPORT_SYMBOL_GPL(poly1305_core_setkey);
/*
* Poly1305 requires a unique key for each tag, which implies that we can't set
* it on the tfm that gets accessed by multiple users simultaneously. Instead we
* expect the key as the first 32 bytes in the update() call.
*/
unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen)
static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen)
{
if (!dctx->sset) {
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
poly1305_core_setkey(&dctx->r, src);
poly1305_core_setkey(&dctx->core_r, src);
src += POLY1305_BLOCK_SIZE;
srclen -= POLY1305_BLOCK_SIZE;
dctx->rset = true;
dctx->rset = 2;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
@@ -85,86 +53,9 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
}
return srclen;
}
EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
static void poly1305_blocks_internal(struct poly1305_state *state,
const struct poly1305_key *key,
const void *src, unsigned int nblocks,
u32 hibit)
{
u32 r0, r1, r2, r3, r4;
u32 s1, s2, s3, s4;
u32 h0, h1, h2, h3, h4;
u64 d0, d1, d2, d3, d4;
if (!nblocks)
return;
r0 = key->r[0];
r1 = key->r[1];
r2 = key->r[2];
r3 = key->r[3];
r4 = key->r[4];
s1 = r1 * 5;
s2 = r2 * 5;
s3 = r3 * 5;
s4 = r4 * 5;
h0 = state->h[0];
h1 = state->h[1];
h2 = state->h[2];
h3 = state->h[3];
h4 = state->h[4];
do {
/* h += m[i] */
h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
/* h *= r */
d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
mlt(h3, s2) + mlt(h4, s1);
d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
mlt(h3, s3) + mlt(h4, s2);
d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
mlt(h3, s4) + mlt(h4, s3);
d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
mlt(h3, r0) + mlt(h4, s4);
d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
mlt(h3, r1) + mlt(h4, r0);
/* (partial) h %= p */
d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
src += POLY1305_BLOCK_SIZE;
} while (--nblocks);
state->h[0] = h0;
state->h[1] = h1;
state->h[2] = h2;
state->h[3] = h3;
state->h[4] = h4;
}
void poly1305_core_blocks(struct poly1305_state *state,
const struct poly1305_key *key,
const void *src, unsigned int nblocks)
{
poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
}
EXPORT_SYMBOL_GPL(poly1305_core_blocks);
static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen, u32 hibit)
static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int srclen)
{
unsigned int datalen;
@@ -174,12 +65,12 @@ static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
srclen = datalen;
}
poly1305_blocks_internal(&dctx->h, &dctx->r,
src, srclen / POLY1305_BLOCK_SIZE, hibit);
poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
srclen / POLY1305_BLOCK_SIZE, 1);
}
int crypto_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
static int crypto_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
@@ -193,13 +84,13 @@ int crypto_poly1305_update(struct shash_desc *desc,
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE, 1 << 24);
POLY1305_BLOCK_SIZE);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
poly1305_blocks(dctx, src, srclen, 1 << 24);
poly1305_blocks(dctx, src, srclen);
src += srclen - (srclen % POLY1305_BLOCK_SIZE);
srclen %= POLY1305_BLOCK_SIZE;
}
@@ -211,87 +102,17 @@ int crypto_poly1305_update(struct shash_desc *desc,
return 0;
}
EXPORT_SYMBOL_GPL(crypto_poly1305_update);
void poly1305_core_emit(const struct poly1305_state *state, void *dst)
{
u32 h0, h1, h2, h3, h4;
u32 g0, g1, g2, g3, g4;
u32 mask;
/* fully carry h */
h0 = state->h[0];
h1 = state->h[1];
h2 = state->h[2];
h3 = state->h[3];
h4 = state->h[4];
h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
/* compute h + -p */
g0 = h0 + 5;
g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
/* select h if h < p, or h + -p if h >= p */
mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
g0 &= mask;
g1 &= mask;
g2 &= mask;
g3 &= mask;
g4 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
h2 = (h2 & mask) | g2;
h3 = (h3 & mask) | g3;
h4 = (h4 & mask) | g4;
/* h = h % (2^128) */
put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
}
EXPORT_SYMBOL_GPL(poly1305_core_emit);
int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
__le32 digest[4];
u64 f = 0;
if (unlikely(!dctx->sset))
return -ENOKEY;
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_core_emit(&dctx->h, digest);
/* mac = (h + s) % (2^128) */
f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
put_unaligned_le32(f, dst + 0);
f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
put_unaligned_le32(f, dst + 4);
f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
put_unaligned_le32(f, dst + 8);
f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
put_unaligned_le32(f, dst + 12);
poly1305_final_generic(dctx, dst);
return 0;
}
EXPORT_SYMBOL_GPL(crypto_poly1305_final);
static struct shash_alg poly1305_alg = {
.digestsize = POLY1305_DIGEST_SIZE,

View File

@@ -2616,6 +2616,30 @@ static const struct alg_test_desc alg_test_descs[] = {
.alg = "authenc(hmac(sha512),rfc3686(ctr(aes)))",
.test = alg_test_null,
.fips_allowed = 1,
}, {
.alg = "blake2s-128",
.test = alg_test_hash,
.suite = {
.hash = __VECS(blakes2s_128_tv_template)
}
}, {
.alg = "blake2s-160",
.test = alg_test_hash,
.suite = {
.hash = __VECS(blakes2s_160_tv_template)
}
}, {
.alg = "blake2s-224",
.test = alg_test_hash,
.suite = {
.hash = __VECS(blakes2s_224_tv_template)
}
}, {
.alg = "blake2s-256",
.test = alg_test_hash,
.suite = {
.hash = __VECS(blakes2s_256_tv_template)
}
}, {
.alg = "cbc(aes)",
.test = alg_test_skcipher,
@@ -2821,6 +2845,12 @@ static const struct alg_test_desc alg_test_descs[] = {
.suite = {
.cipher = __VECS(cts_mode_tv_template)
}
}, {
.alg = "curve25519",
.test = alg_test_kpp,
.suite = {
.kpp = __VECS(curve25519_tv_template)
}
}, {
.alg = "deflate",
.test = alg_test_comp,

File diff suppressed because it is too large Load Diff

View File

@@ -237,7 +237,7 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
struct binder_work {
struct list_head entry;
enum {
enum binder_work_type {
BINDER_WORK_TRANSACTION = 1,
BINDER_WORK_TRANSACTION_COMPLETE,
BINDER_WORK_RETURN_ERROR,
@@ -897,27 +897,6 @@ static struct binder_work *binder_dequeue_work_head_ilocked(
return w;
}
/**
* binder_dequeue_work_head() - Dequeues the item at head of list
* @proc: binder_proc associated with list
* @list: list to dequeue head
*
* Removes the head of the list if there are items on the list
*
* Return: pointer dequeued binder_work, NULL if list was empty
*/
static struct binder_work *binder_dequeue_work_head(
struct binder_proc *proc,
struct list_head *list)
{
struct binder_work *w;
binder_inner_proc_lock(proc);
w = binder_dequeue_work_head_ilocked(list);
binder_inner_proc_unlock(proc);
return w;
}
static void
binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer);
static void binder_free_thread(struct binder_thread *thread);
@@ -4552,13 +4531,17 @@ static void binder_release_work(struct binder_proc *proc,
struct list_head *list)
{
struct binder_work *w;
enum binder_work_type wtype;
while (1) {
w = binder_dequeue_work_head(proc, list);
binder_inner_proc_lock(proc);
w = binder_dequeue_work_head_ilocked(list);
wtype = w ? w->type : 0;
binder_inner_proc_unlock(proc);
if (!w)
return;
switch (w->type) {
switch (wtype) {
case BINDER_WORK_TRANSACTION: {
struct binder_transaction *t;
@@ -4592,9 +4575,11 @@ static void binder_release_work(struct binder_proc *proc,
kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
} break;
case BINDER_WORK_NODE:
break;
default:
pr_err("unexpected work type, %d, not freed\n",
w->type);
wtype);
break;
}
}

View File

@@ -481,7 +481,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
drv->bus->name, __func__, drv->name, dev_name(dev));
if (!list_empty(&dev->devres_head)) {
dev_crit(dev, "Resources present before probing\n");
return -EBUSY;
ret = -EBUSY;
goto done;
}
re_probe:
@@ -588,7 +589,7 @@ pinctrl_bind_failed:
ret = 0;
done:
atomic_dec(&probe_count);
wake_up(&probe_waitqueue);
wake_up_all(&probe_waitqueue);
return ret;
}

Some files were not shown because too many files have changed in this diff Show More