Commit 797994f8 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - XTS mode optimisation for twofish/cast6/camellia/aes on x86

 - AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia

 - SSSE3/AVX/AVX2 optimisations for sha256/sha512

 - Added driver for SAHARA2 crypto accelerator

 - Fix for GMAC when used in non-IPsec secnarios

 - Added generic CMAC implementation (including IPsec glue)

 - IP update for crypto/atmel

 - Support for more than one device in hwrng/timeriomem

 - Added Broadcom BCM2835 RNG driver

 - Misc fixes

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
  crypto: caam - fix job ring cleanup code
  crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
  crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
  crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
  crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
  crypto: tcrypt - add async cipher speed tests for blowfish
  crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
  crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
  crypto: aesni_intel - add more optimized XTS mode for x86-64
  crypto: x86/camellia-aesni-avx - add more optimized XTS code
  crypto: cast6-avx: use new optimized XTS code
  crypto: x86/twofish-avx - use optimized XTS code
  crypto: x86 - add more optimized XTS-mode for serpent-avx
  xfrm: add rfc4494 AES-CMAC-96 support
  crypto: add CMAC support to CryptoAPI
  crypto: testmgr - add empty test vectors for null ciphers
  crypto: testmgr - add AES GMAC test vectors
  crypto: gcm - fix rfc4543 to handle async crypto correctly
  crypto: gcm - make GMAC work when dst and src are different
  hwrng: timeriomem - added devicetree hooks
  ...
parents c8d85669 3862de1f
Freescale SAHARA Cryptographic Accelerator included in some i.MX chips.
Currently only i.MX27 is supported.
Required properties:
- compatible : Should be "fsl,<soc>-sahara"
- reg : Should contain SAHARA registers location and length
- interrupts : Should contain SAHARA interrupt number
Example:
sah@10025000 {
compatible = "fsl,imx27-sahara";
reg = < 0x10025000 0x800>;
interrupts = <75>;
};
HWRNG support for the timeriomem_rng driver
Required properties:
- compatible : "timeriomem_rng"
- reg : base address to sample from
- period : wait time in microseconds to use between samples
N.B. currently 'reg' must be four bytes wide and aligned
Example:
hwrng@44 {
#address-cells = <1>;
#size-cells = <1>;
compatible = "timeriomem_rng";
reg = <0x44 0x04>;
period = <1000000>;
};
BCM2835 Random number generator
Required properties:
- compatible : should be "brcm,bcm2835-rng"
- reg : Specifies base physical address and size of the registers.
Example:
rng {
compatible = "brcm,bcm2835-rng";
reg = <0x7e104000 0x10>;
};
......@@ -63,7 +63,7 @@ Intel RNG Driver notes:
* FIXME: support poll(2)
NOTE: request_mem_region was removed, for two reasons:
NOTE: request_mem_region was removed, for three reasons:
1) Only one RNG is supported by this driver, 2) The location
used by the RNG is a fixed location in MMIO-addressable memory,
3) users with properly working BIOS e820 handling will always
......
......@@ -18,7 +18,7 @@
#include <linux/platform_device.h>
#include <linux/i2c-gpio.h>
#include <linux/atmel-mci.h>
#include <linux/platform_data/atmel-aes.h>
#include <linux/platform_data/crypto-atmel.h>
#include <linux/platform_data/at91_adc.h>
......@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {}
* -------------------------------------------------------------------- */
#if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
static struct aes_platform_data aes_data;
static struct crypto_platform_data aes_data;
static struct crypto_dma_data alt_atslave;
static u64 aes_dmamask = DMA_BIT_MASK(32);
static struct resource aes_resources[] = {
......@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = {
static void __init at91_add_device_aes(void)
{
struct at_dma_slave *atslave;
struct aes_dma_data *alt_atslave;
alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
/* DMA TX slave channel configuration */
atslave = &alt_atslave->txdata;
atslave = &alt_atslave.txdata;
atslave->dma_dev = &at_hdmac_device.dev;
atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW |
ATC_SRC_PER(AT_DMA_ID_AES_RX);
/* DMA RX slave channel configuration */
atslave = &alt_atslave->rxdata;
atslave = &alt_atslave.rxdata;
atslave->dma_dev = &at_hdmac_device.dev;
atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW |
ATC_DST_PER(AT_DMA_ID_AES_TX);
aes_data.dma_slave = alt_atslave;
aes_data.dma_slave = &alt_atslave;
platform_device_register(&at91sam9g45_aes_device);
}
#else
......
......@@ -2,6 +2,10 @@
# Arch-specific CryptoAPI modules.
#
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
......@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
camellia-aesni-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
endif
# These modules require assembler to support AVX2.
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
......@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
twofish_avx_glue.o
serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
serpent_avx_glue.o
endif
ifeq ($(avx2_supported),yes)
blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
......@@ -34,6 +34,10 @@
#ifdef __x86_64__
.data
.align 16
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001
......@@ -105,6 +109,8 @@ enc: .octa 0x2
#define CTR %xmm11
#define INC %xmm12
#define GF128MUL_MASK %xmm10
#ifdef __x86_64__
#define AREG %rax
#define KEYP %rdi
......@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
.Lctr_enc_just_ret:
ret
ENDPROC(aesni_ctr_enc)
/*
* _aesni_gf128mul_x_ble: internal ABI
* Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
* CTR: == temporary value
*/
#define _aesni_gf128mul_x_ble() \
pshufd $0x13, IV, CTR; \
paddq IV, IV; \
psrad $31, CTR; \
pand GF128MUL_MASK, CTR; \
pxor CTR, IV;
/*
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* bool enc, u8 *iv)
*/
ENTRY(aesni_xts_crypt8)
cmpb $0, %cl
movl $0, %ecx
movl $240, %r10d
leaq _aesni_enc4, %r11
leaq _aesni_dec4, %rax
cmovel %r10d, %ecx
cmoveq %rax, %r11
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
addq %rcx, KEYP
movdqa IV, STATE1
pxor 0x00(INP), STATE1
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
pxor 0x10(INP), STATE2
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
pxor 0x20(INP), STATE3
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
pxor 0x30(INP), STATE4
movdqu IV, 0x30(OUTP)
call *%r11
pxor 0x00(OUTP), STATE1
movdqu STATE1, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE1
pxor 0x40(INP), STATE1
movdqu IV, 0x40(OUTP)
pxor 0x10(OUTP), STATE2
movdqu STATE2, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
pxor 0x50(INP), STATE2
movdqu IV, 0x50(OUTP)
pxor 0x20(OUTP), STATE3
movdqu STATE3, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
pxor 0x60(INP), STATE3
movdqu IV, 0x60(OUTP)
pxor 0x30(OUTP), STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
pxor 0x70(INP), STATE4
movdqu IV, 0x70(OUTP)
_aesni_gf128mul_x_ble()
movups IV, (IVP)
call *%r11
pxor 0x40(OUTP), STATE1
movdqu STATE1, 0x40(OUTP)
pxor 0x50(OUTP), STATE2
movdqu STATE2, 0x50(OUTP)
pxor 0x60(OUTP), STATE3
movdqu STATE3, 0x60(OUTP)
pxor 0x70(OUTP), STATE4
movdqu STATE4, 0x70(OUTP)
ret
ENDPROC(aesni_xts_crypt8)
#endif
......@@ -39,6 +39,9 @@
#include <crypto/internal/aead.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#ifdef CONFIG_X86_64
#include <asm/crypto/glue_helper.h>
#endif
#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
#define HAS_PCBC
......@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, bool enc, u8 *iv);
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, Ciphertext output. Encrypt in-place is allowed.
......@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
aesni_enc(ctx, out, in);
}
#ifdef CONFIG_X86_64
static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
}
static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
}
static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
}
static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
.num_funcs = 2,
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
} }
};
static const struct common_glue_ctx aesni_dec_xts = {
.num_funcs = 2,
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
}, {
.num_blocks = 1,
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
} }
};
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx));
}
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx));
}
#else
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
......@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
return ret;
}
#endif
#ifdef CONFIG_X86_64
static int rfc4106_init(struct crypto_tfm *tfm)
{
......
/*
* x86_64/AVX2 assembler optimized version of Blowfish
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/
#include <linux/linkage.h>
.file "blowfish-avx2-asm_64.S"
.data
.align 32
.Lprefetch_mask:
.long 0*64
.long 1*64
.long 2*64
.long 3*64
.long 4*64
.long 5*64
.long 6*64
.long 7*64
.Lbswap32_mask:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:
.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
.text
/* structure of crypto context */
#define p 0
#define s0 ((16 + 2) * 4)
#define s1 ((16 + 2 + (1 * 256)) * 4)
#define s2 ((16 + 2 + (2 * 256)) * 4)
#define s3 ((16 + 2 + (3 * 256)) * 4)
/* register macros */
#define CTX %rdi
#define RIO %rdx
#define RS0 %rax
#define RS1 %r8
#define RS2 %r9
#define RS3 %r10
#define RLOOP %r11
#define RLOOPd %r11d
#define RXr0 %ymm8
#define RXr1 %ymm9
#define RXr2 %ymm10
#define RXr3 %ymm11
#define RXl0 %ymm12
#define RXl1 %ymm13
#define RXl2 %ymm14
#define RXl3 %ymm15
/* temp regs */
#define RT0 %ymm0
#define RT0x %xmm0
#define RT1 %ymm1
#define RT1x %xmm1
#define RIDX0 %ymm2
#define RIDX1 %ymm3
#define RIDX1x %xmm3
#define RIDX2 %ymm4
#define RIDX3 %ymm5
/* vpgatherdd mask and '-1' */
#define RNOT %ymm6
/* byte mask, (-1 >> 24) */
#define RBYTE %ymm7
/***********************************************************************
* 32-way AVX2 blowfish
***********************************************************************/
#define F(xl, xr) \
vpsrld $24, xl, RIDX0; \
vpsrld $16, xl, RIDX1; \
vpsrld $8, xl, RIDX2; \
vpand RBYTE, RIDX1, RIDX1; \
vpand RBYTE, RIDX2, RIDX2; \
vpand RBYTE, xl, RIDX3; \
\
vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpcmpeqd RIDX0, RIDX0, RIDX0; \
\
vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
vpcmpeqd RIDX1, RIDX1, RIDX1; \
vpaddd RT0, RT1, RT0; \
\
vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
vpxor RT0, RT1, RT0; \
\
vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
vpcmpeqd RNOT, RNOT, RNOT; \
vpaddd RT0, RT1, RT0; \
\
vpxor RT0, xr, xr;
#define add_roundkey(xl, nmem) \
vpbroadcastd nmem, RT0; \
vpxor RT0, xl ## 0, xl ## 0; \
vpxor RT0, xl ## 1, xl ## 1; \
vpxor RT0, xl ## 2, xl ## 2; \
vpxor RT0, xl ## 3, xl ## 3;
#define round_enc() \
add_roundkey(RXr, p(CTX,RLOOP,4)); \
F(RXl0, RXr0); \
F(RXl1, RXr1); \
F(RXl2, RXr2); \
F(RXl3, RXr3); \
\
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
F(RXr0, RXl0); \
F(RXr1, RXl1); \
F(RXr2, RXl2); \
F(RXr3, RXl3);
#define round_dec() \
add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
F(RXl0, RXr0); \
F(RXl1, RXr1); \
F(RXl2, RXr2); \
F(RXl3, RXr3); \
\
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
F(RXr0, RXl0); \
F(RXr1, RXl1); \
F(RXr2, RXl2); \
F(RXr3, RXl3);
#define init_round_constants() \
vpcmpeqd RNOT, RNOT, RNOT; \
leaq s0(CTX), RS0; \
leaq s1(CTX), RS1; \
leaq s2(CTX), RS2; \
leaq s3(CTX), RS3; \
vpsrld $24, RNOT, RBYTE;
#define transpose_2x2(x0, x1, t0) \
vpunpckldq x0, x1, t0; \
vpunpckhdq x0, x1, x1; \
\
vpunpcklqdq t0, x1, x0; \
vpunpckhqdq t0, x1, x1;
#define read_block(xl, xr) \
vbroadcasti128 .Lbswap32_mask, RT1; \