mirror of
https://github.com/Luzifer/cloudkeys-go.git
synced 2024-11-10 07:00:08 +00:00
Knut Ahlers
a1df72edc5
commitf0db1ff1f8
Author: Knut Ahlers <knut@ahlers.me> Date: Sun Dec 24 12:19:56 2017 +0100 Mark option as deprecated Signed-off-by: Knut Ahlers <knut@ahlers.me> commit9891df2a16
Author: Knut Ahlers <knut@ahlers.me> Date: Sun Dec 24 12:11:56 2017 +0100 Fix: Typo Signed-off-by: Knut Ahlers <knut@ahlers.me> commit836006de64
Author: Knut Ahlers <knut@ahlers.me> Date: Sun Dec 24 12:04:20 2017 +0100 Add new dependencies Signed-off-by: Knut Ahlers <knut@ahlers.me> commitd64fee60c8
Author: Knut Ahlers <knut@ahlers.me> Date: Sun Dec 24 11:55:52 2017 +0100 Replace insecure password hashing Prior to this commit passwords were hashed with a static salt and using the SHA1 hashing function. This could lead to passwords being attackable in case someone gets access to the raw data stored inside the database. This commit introduces password hashing using the bcrypt hashing function, which addresses this issue. Old passwords are not automatically re-hashed as they are unknown. Replacing the old password scheme is not that easy and needs #10 to be solved. Therefore the old hashing scheme is kept for compatibility reasons. Signed-off-by: Knut Ahlers <knut@ahlers.me> Signed-off-by: Knut Ahlers <knut@ahlers.me> closes #14 closes #15
2714 lines
106 KiB
ArmAsm
2714 lines
106 KiB
ArmAsm
// Copyright 2016 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
|
|
|
|
// +build go1.7,amd64,!gccgo,!appengine
|
|
|
|
#include "textflag.h"
|
|
// General register allocation
//
// oup, inp, inl, keyp and adp are loaded from the dst, src, src_len, key and ad
// arguments at the top of chacha20Poly1305Open.
// Two pairs of names share a physical register and must never be live together:
//   adp  and itr1 are both CX
//   keyp and t3   are both R8 (so every polyMul clobbers keyp)
// acc0..acc2 hold the Poly1305 accumulator (see polyAdd); t0..t3 are the scratch
// registers of the polyMul* macros.
|
|
#define oup DI
|
|
#define inp SI
|
|
#define inl BX
|
|
#define adp CX // free to reuse, after we hash the additional data
|
|
#define keyp R8 // free to reuse, when we copy the key to stack
|
|
#define itr2 R9 // general iterator
|
|
#define itr1 CX // general iterator
|
|
#define acc0 R10
|
|
#define acc1 R11
|
|
#define acc2 R12
|
|
#define t0 R13
|
|
#define t1 R14
|
|
#define t2 R15
|
|
#define t3 R8
|
|
// Register and stack allocation for the SSE code
//
// Stack slots are 16 bytes each and addressed relative to BP (nine slots,
// offsets 0..143). rStore must remain the first slot: the polyMulStage* macros
// read the two halves of the clamped r key directly as (0*8)(BP) and (1*8)(BP).
//
// A/B/C/D are the four rows of a ChaCha20 state; the numeric suffix selects
// the block. Block 3 (A3..D3) is an alias of the temporaries T0..T3
// (X12..X15), so code that works on four blocks at once has no free XMM
// register and spills one row through tmpStore when it needs a scratch.
|
|
#define rStore (0*16)(BP)
|
|
#define sStore (1*16)(BP)
|
|
#define state1Store (2*16)(BP)
|
|
#define state2Store (3*16)(BP)
|
|
#define tmpStore (4*16)(BP)
|
|
#define ctr0Store (5*16)(BP)
|
|
#define ctr1Store (6*16)(BP)
|
|
#define ctr2Store (7*16)(BP)
|
|
#define ctr3Store (8*16)(BP)
|
|
#define A0 X0
|
|
#define A1 X1
|
|
#define A2 X2
|
|
#define B0 X3
|
|
#define B1 X4
|
|
#define B2 X5
|
|
#define C0 X6
|
|
#define C1 X7
|
|
#define C2 X8
|
|
#define D0 X9
|
|
#define D1 X10
|
|
#define D2 X11
|
|
#define T0 X12
|
|
#define T1 X13
|
|
#define T2 X14
|
|
#define T3 X15
|
|
#define A3 T0
|
|
#define B3 T1
|
|
#define C3 T2
|
|
#define D3 T3
|
|
// Register and stack allocation for the AVX2 code
//
// Stack slots are 32 bytes each and addressed relative to BP; they overlay the
// SSE slots defined above, so the two layouts are never used at the same time.
// rsStoreAVX2 is slot 0 and receives the clamped Poly1305 key, which keeps the
// (0*8)(BP) / (1*8)(BP) reads of the polyMul* macros valid.
//
// AAn/BBn/CCn/DDn are the four state rows of block pair n, one block per
// 128-bit lane. The temporaries TT0..TT3 are aliases of DD3, AA3, BB3 and CC3,
// so they are only free while the fourth block pair is not in use.
|
|
#define rsStoreAVX2 (0*32)(BP)
|
|
#define state1StoreAVX2 (1*32)(BP)
|
|
#define state2StoreAVX2 (2*32)(BP)
|
|
#define ctr0StoreAVX2 (3*32)(BP)
|
|
#define ctr1StoreAVX2 (4*32)(BP)
|
|
#define ctr2StoreAVX2 (5*32)(BP)
|
|
#define ctr3StoreAVX2 (6*32)(BP)
|
|
#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
|
|
#define AA0 Y0
|
|
#define AA1 Y5
|
|
#define AA2 Y6
|
|
#define AA3 Y7
|
|
#define BB0 Y14
|
|
#define BB1 Y9
|
|
#define BB2 Y10
|
|
#define BB3 Y11
|
|
#define CC0 Y12
|
|
#define CC1 Y13
|
|
#define CC2 Y8
|
|
#define CC3 Y15
|
|
#define DD0 Y4
|
|
#define DD1 Y1
|
|
#define DD2 Y2
|
|
#define DD3 Y3
|
|
#define TT0 DD3
|
|
#define TT1 AA3
|
|
#define TT2 BB3
|
|
#define TT3 CC3
|
|
// ChaCha20 constants
// The four 32-bit words are the little-endian ASCII of "expand 32-byte k".
// The 16-byte row is stored twice so that the 32-byte symbol can be loaded
// either into one XMM register or into both 128-bit lanes of a YMM register.
|
|
DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
|
|
DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
|
|
DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
|
|
DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
|
|
DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
|
|
DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
|
|
DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
|
|
DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
|
|
// <<< 16 with PSHUFB
// Byte-shuffle control: in every 32-bit word the result bytes 0..3 are taken
// from source bytes 2,3,0,1, which is a rotate left by 16 bits. The 16-byte
// pattern is repeated so the table also covers both lanes of a YMM register.
|
|
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
|
|
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
|
DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
|
|
DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
|
|
// <<< 8 with PSHUFB
// Same idea: result bytes 0..3 come from source bytes 3,0,1,2 of each word,
// i.e. a rotate left by 8 bits.
|
|
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
|
|
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
|
DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
|
|
DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
|
|
|
|
// avx2InitMask is added (VPADDD) to the counter row right after it has been
// broadcast to both 128-bit lanes: the low lane is left unchanged and the
// first dword of the high lane is incremented by 1.
DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
|
|
DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
|
|
DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
|
|
DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
|
|
|
|
// avx2IncMask adds 2 to the first dword of each 128-bit lane, advancing a
// two-block YMM counter row to the next pair of blocks.
DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
|
|
DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
|
|
DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
|
|
DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
|
|
// Poly1305 key clamp
// The low 16 bytes clear the bits of r that Poly1305 requires to be zero.
// The high 16 bytes are all ones, so ANDing a full 32-byte r||s value with
// this table clamps r and leaves s untouched.
|
|
DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
|
|
DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
|
|
DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
|
|
DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
|
|
|
|
// sseIncMask adds 1 to the first dword of an XMM counter row (used with PADDL).
DATA ·sseIncMask<>+0x00(SB)/8, $0x1
|
|
DATA ·sseIncMask<>+0x08(SB)/8, $0x0
|
|
// To load/store the last < 16 bytes in a buffer
// The table has 15 rows of 16 bytes. Row k-1 (byte offset (k-1)*16) keeps the
// low k bytes of an XMM register and clears the rest, for k = 1..15. Callers
// index it as -16(base)(len*16), so len must be in the range 1..15.
|
|
DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
|
|
DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
|
|
DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
|
|
DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
|
|
DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
|
|
DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
|
|
DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
|
|
DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
|
|
DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
|
|
DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
|
|
DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
|
|
DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
|
|
DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
|
|
DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
|
|
DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
|
|
DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
|
|
|
|
// Symbol declarations for the tables above. All are file-local (<>),
// read-only and pointer-free. The sizes must cover the highest DATA offset
// written for each symbol (andMask: 15 rows * 16 bytes = 240).
GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
|
|
GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
|
|
GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
|
|
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
//
// Each macro below is a hand-assembled "PALIGNR $imm, Xn, Xn", i.e. a byte
// rotation of one XMM register: 66 [REX 45 for X8..X15] 0F 3A 0F modrm imm8.
// The register is hard-coded in the ModRM byte, so the macro name (B0, C2, ...)
// must match the A0..D3 register aliases; changing an alias requires
// re-encoding the corresponding macros.
//
// The "Left" set rotates the B, C and D rows by 4, 8 and 12 bytes; the "Right"
// set applies the complementary rotations (12, 8, 4) and restores the rows.
// The callers apply Left between the first and second chachaQR of a double
// round and Right after the second.
|
|
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
|
|
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
|
|
#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
|
|
#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
|
|
#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
|
|
#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
|
|
#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
|
|
#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
|
|
#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
|
|
#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
|
|
#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
|
|
#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
|
|
#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
|
|
#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
|
|
#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
|
|
#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
|
|
// An 8-byte rotation of a 16-byte register is its own inverse, so the C rows
// reuse the Left encodings.
#define shiftC0Right shiftC0Left
|
|
#define shiftC1Right shiftC1Left
|
|
#define shiftC2Right shiftC2Left
|
|
#define shiftC3Right shiftC3Left
|
|
#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
|
|
#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
|
|
#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
|
|
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
|
|
// Some macros
//
// chachaQR(A, B, C, D, T) runs one ChaCha quarter round on all four 32-bit
// columns of the XMM rows A, B, C, D at once:
//   A += B; D ^= A; D <<<= 16
//   C += D; B ^= C; B <<<= 12
//   A += B; D ^= A; D <<<= 8
//   C += D; B ^= C; B <<<= 7
// The 16- and 8-bit rotations use PSHUFB with the rol16/rol8 tables; the 12-
// and 7-bit rotations are built from a left shift, a right shift and an XOR.
// T is a scratch XMM register and is overwritten.
#define chachaQR(A, B, C, D, T) \
|
|
PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
|
|
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
|
|
PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
|
|
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
|
|
|
|
// chachaQR_AVX2(A, B, C, D, T) is the YMM version of chachaQR: the same
// quarter round (rotations by 16, 12, 8 and 7 bits) applied to the rows of
// the two blocks held in the 128-bit lanes of A, B, C, D. The three-operand
// VEX forms remove the extra register copy; T is still overwritten.
#define chachaQR_AVX2(A, B, C, D, T) \
|
|
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
|
|
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
|
|
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
|
|
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
|
|
|
|
// Poly1305 building blocks. The accumulator is acc0 (low 64 bits), acc1 and
// acc2 (top bits). The clamped key r is read from memory at (0*8)(BP) and
// (1*8)(BP), so BP must point at the stored key whenever these are used.
//
// polyAdd(S): acc += the 16 bytes at memory operand S, plus 2^128 (the
// "ADCQ $1, acc2" adds the per-block pad bit together with the carry).
#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
|
|
// polyMulStage1..3 compute acc * r as a four-limb product in t0:t1:t2:t3.
// Stage 1 multiplies by the low half of r, stages 2 and 3 by the high half.
// They clobber AX, DX, t0..t3 (t3 is R8, the same register as keyp) and stage 2
// also reuses acc0 as a temporary, so the stages must run in order with no
// other use of these registers in between.
#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
|
|
#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
|
|
#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
|
|
// polyMulReduceStage folds the product back into acc0:acc1:acc2 modulo
// 2^130 - 5. The low 130 bits are kept (acc2 = t2 & 3). With c = product >> 130,
// the code adds 4*c (t2 with its low two bits cleared, and t3) and then c
// (t2:t3 shifted right by two), i.e. 5*c in total.
#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
|
|
|
|
// The _AVX2 stages compute the same product with MULXQ, which takes its
// implicit multiplicand from DX and does not modify the flags. They clobber
// AX, DX, t0..t3, acc0 and acc1 and are followed by the same polyMulReduceStage.
#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
|
|
#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
|
|
#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
|
|
|
|
// polyMul / polyMulAVX2: acc = (acc * r) mod 2^130 - 5, as one sequence.
#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
|
|
#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
|
|
// ----------------------------------------------------------------------------
|
|
// polyHashADInternal resets the Poly1305 accumulator and absorbs the
// additional data into it, zero-padding the final partial block.
//
// This is a file-local helper with a private register convention, not the Go
// ABI:
//   In:       adp  (CX) = pointer to the additional data
//             itr2 (R9) = length of the additional data in bytes
//             BP        = base of the stack area whose first 16 bytes hold the
//                         clamped key r (read by polyMul)
//   Out:      acc0:acc1:acc2 = accumulator after hashing the additional data
//   Clobbers: AX, DX, t0, t1, t2, t3 (R8, aliased by keyp), adp, itr2, flags
// It uses no stack frame of its own (NOSPLIT, $0).
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
|
|
// adp points to beginning of additional data
|
|
// itr2 holds ad length
|
|
XORQ acc0, acc0
|
|
XORQ acc1, acc1
|
|
XORQ acc2, acc2
|
|
CMPQ itr2, $13
|
|
JNE hashADLoop
|
|
|
|
|
|
openFastTLSAD:
|
|
// Special treatment for the TLS case of 13 bytes
// Bytes 0..7 go to acc0. The second load reads bytes 5..12 and the shift by
// 24 bits drops bytes 5..7, leaving bytes 8..12 zero-extended in acc1.
// acc2 = 1 is the 2^128 pad bit that polyAdd would otherwise add.
|
|
MOVQ (adp), acc0
|
|
MOVQ 5(adp), acc1
|
|
SHRQ $24, acc1
|
|
MOVQ $1, acc2
|
|
polyMul
|
|
RET
|
|
|
|
|
|
hashADLoop:
|
|
// Hash in 16 byte chunks
|
|
CMPQ itr2, $16
|
|
JB hashADTail
|
|
polyAdd(0(adp))
|
|
LEAQ (1*16)(adp), adp
|
|
SUBQ $16, itr2
|
|
polyMul
|
|
JMP hashADLoop
|
|
|
|
|
|
hashADTail:
|
|
CMPQ itr2, $0
|
|
JE hashADDone
|
|
|
|
|
|
// Hash last < 16 byte tail
// The remaining 1..15 bytes are gathered into t1:t0 one at a time, walking
// backwards from the end of the data so nothing past the buffer is read.
|
|
XORQ t0, t0
|
|
XORQ t1, t1
|
|
XORQ t2, t2
|
|
ADDQ itr2, adp
|
|
|
|
|
|
hashADTailLoop:
|
|
// Shift the 128-bit value t1:t0 left by one byte, then insert the next
// (earlier) data byte at the bottom. MOVB only writes the low byte of t2;
// its upper bits stay zero from the XORQ above.
SHLQ $8, t1:t0
|
|
SHLQ $8, t0
|
|
MOVB -1(adp), t2
|
|
XORQ t2, t0
|
|
DECQ adp
|
|
DECQ itr2
|
|
JNE hashADTailLoop
|
|
|
|
|
|
hashADTailFinish:
|
|
// Same operation as polyAdd, with the zero-padded block taken from t1:t0
// instead of memory.
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
|
|
polyMul
|
|
|
|
|
|
// Finished AD
|
|
hashADDone:
|
|
RET
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
|
|
TEXT ·chacha20Poly1305Open(SB), 0, $288-97
|
|
// For aligned stack access
|
|
MOVQ SP, BP
|
|
ADDQ $32, BP
|
|
ANDQ $-32, BP
|
|
MOVQ dst+0(FP), oup
|
|
MOVQ key+24(FP), keyp
|
|
MOVQ src+48(FP), inp
|
|
MOVQ src_len+56(FP), inl
|
|
MOVQ ad+72(FP), adp
|
|
|
|
// Check for AVX2 support
|
|
CMPB ·useAVX2(SB), $1
|
|
JE chacha20Poly1305Open_AVX2
|
|
|
|
// Special optimization, for very short buffers
|
|
CMPQ inl, $128
|
|
JBE openSSE128 // About 16% faster
|
|
|
|
// For long buffers, prepare the poly key first
|
|
MOVOU ·chacha20Constants<>(SB), A0
|
|
MOVOU (1*16)(keyp), B0
|
|
MOVOU (2*16)(keyp), C0
|
|
MOVOU (3*16)(keyp), D0
|
|
MOVO D0, T1
|
|
|
|
// Store state on stack for future use
|
|
MOVO B0, state1Store
|
|
MOVO C0, state2Store
|
|
MOVO D0, ctr3Store
|
|
MOVQ $10, itr2
|
|
|
|
openSSEPreparePolyKey:
|
|
chachaQR(A0, B0, C0, D0, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
chachaQR(A0, B0, C0, D0, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
DECQ itr2
|
|
JNE openSSEPreparePolyKey
|
|
|
|
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
|
|
|
|
// Clamp and store the key
|
|
PAND ·polyClampMask<>(SB), A0
|
|
MOVO A0, rStore; MOVO B0, sStore
|
|
|
|
// Hash AAD
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
openSSEMainLoop:
|
|
CMPQ inl, $256
|
|
JB openSSEMainLoopDone
|
|
|
|
// Load state, increment counter blocks
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
|
|
|
|
// Store counters
|
|
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
|
|
|
|
// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
|
|
MOVQ $4, itr1
|
|
MOVQ inp, itr2
|
|
|
|
openSSEInternalLoop:
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
polyAdd(0(itr2))
|
|
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
|
|
polyMulStage1
|
|
polyMulStage2
|
|
LEAQ (2*8)(itr2), itr2
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
polyMulStage3
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
polyMulReduceStage
|
|
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
|
|
DECQ itr1
|
|
JGE openSSEInternalLoop
|
|
|
|
polyAdd(0(itr2))
|
|
polyMul
|
|
LEAQ (2*8)(itr2), itr2
|
|
|
|
CMPQ itr1, $-6
|
|
JG openSSEInternalLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
|
|
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
|
|
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
|
|
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
|
|
|
|
// Load - xor - store
|
|
MOVO D3, tmpStore
|
|
MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
|
|
MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
|
|
MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
|
|
MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
|
|
MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
|
|
MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
|
|
MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
|
|
MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
|
|
MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
|
|
MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
|
|
MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
|
|
MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
|
|
MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
|
|
MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
|
|
MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
|
|
MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
|
|
LEAQ 256(inp), inp
|
|
LEAQ 256(oup), oup
|
|
SUBQ $256, inl
|
|
JMP openSSEMainLoop
|
|
|
|
openSSEMainLoopDone:
|
|
// Handle the various tail sizes efficiently
|
|
TESTQ inl, inl
|
|
JE openSSEFinalize
|
|
CMPQ inl, $64
|
|
JBE openSSETail64
|
|
CMPQ inl, $128
|
|
JBE openSSETail128
|
|
CMPQ inl, $192
|
|
JBE openSSETail192
|
|
JMP openSSETail256
|
|
|
|
openSSEFinalize:
|
|
// Hash in the PT, AAD lengths
|
|
ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
|
|
polyMul
|
|
|
|
// Final reduce
|
|
MOVQ acc0, t0
|
|
MOVQ acc1, t1
|
|
MOVQ acc2, t2
|
|
SUBQ $-5, acc0
|
|
SBBQ $-1, acc1
|
|
SBBQ $3, acc2
|
|
CMOVQCS t0, acc0
|
|
CMOVQCS t1, acc1
|
|
CMOVQCS t2, acc2
|
|
|
|
// Add in the "s" part of the key
|
|
ADDQ 0+sStore, acc0
|
|
ADCQ 8+sStore, acc1
|
|
|
|
// Finally, constant time compare to the tag at the end of the message
|
|
XORQ AX, AX
|
|
MOVQ $1, DX
|
|
XORQ (0*8)(inp), acc0
|
|
XORQ (1*8)(inp), acc1
|
|
ORQ acc1, acc0
|
|
CMOVQEQ DX, AX
|
|
|
|
// Return true iff tags are equal
|
|
MOVB AX, ret+96(FP)
|
|
RET
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for buffers smaller than 129 bytes
|
|
openSSE128:
|
|
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
|
|
MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
|
|
MOVQ $10, itr2
|
|
|
|
openSSE128InnerCipherLoop:
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Left; shiftB1Left; shiftB2Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Right; shiftB1Right; shiftB2Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right
|
|
DECQ itr2
|
|
JNE openSSE128InnerCipherLoop
|
|
|
|
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
|
|
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
|
|
PADDL T2, C1; PADDL T2, C2
|
|
PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
|
|
|
|
// Clamp and store the key
|
|
PAND ·polyClampMask<>(SB), A0
|
|
MOVOU A0, rStore; MOVOU B0, sStore
|
|
|
|
// Hash
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
openSSE128Open:
|
|
CMPQ inl, $16
|
|
JB openSSETail16
|
|
SUBQ $16, inl
|
|
|
|
// Load for hashing
|
|
polyAdd(0(inp))
|
|
|
|
// Load for decryption
|
|
MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
|
|
LEAQ (1*16)(inp), inp
|
|
LEAQ (1*16)(oup), oup
|
|
polyMul
|
|
|
|
// Shift the stream "left"
|
|
MOVO B1, A1
|
|
MOVO C1, B1
|
|
MOVO D1, C1
|
|
MOVO A2, D1
|
|
MOVO B2, A2
|
|
MOVO C2, B2
|
|
MOVO D2, C2
|
|
JMP openSSE128Open
|
|
|
|
openSSETail16:
|
|
TESTQ inl, inl
|
|
JE openSSEFinalize
|
|
|
|
// We can safely load the CT from the end, because it is padded with the MAC
|
|
MOVQ inl, itr2
|
|
SHLQ $4, itr2
|
|
LEAQ ·andMask<>(SB), t0
|
|
MOVOU (inp), T0
|
|
ADDQ inl, inp
|
|
PAND -16(t0)(itr2*1), T0
|
|
MOVO T0, 0+tmpStore
|
|
MOVQ T0, t0
|
|
MOVQ 8+tmpStore, t1
|
|
PXOR A1, T0
|
|
|
|
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
|
|
openSSETail16Store:
|
|
MOVQ T0, t3
|
|
MOVB t3, (oup)
|
|
PSRLDQ $1, T0
|
|
INCQ oup
|
|
DECQ inl
|
|
JNE openSSETail16Store
|
|
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
|
|
polyMul
|
|
JMP openSSEFinalize
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 64 bytes of ciphertext
|
|
openSSETail64:
|
|
// Need to decrypt up to 64 bytes - prepare single block
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
|
|
XORQ itr2, itr2
|
|
MOVQ inl, itr1
|
|
CMPQ itr1, $16
|
|
JB openSSETail64LoopB
|
|
|
|
openSSETail64LoopA:
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
polyAdd(0(inp)(itr2*1))
|
|
polyMul
|
|
SUBQ $16, itr1
|
|
|
|
openSSETail64LoopB:
|
|
ADDQ $16, itr2
|
|
chachaQR(A0, B0, C0, D0, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
chachaQR(A0, B0, C0, D0, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
|
|
CMPQ itr1, $16
|
|
JAE openSSETail64LoopA
|
|
|
|
CMPQ itr2, $160
|
|
JNE openSSETail64LoopB
|
|
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
|
|
|
|
openSSETail64DecLoop:
|
|
CMPQ inl, $16
|
|
JB openSSETail64DecLoopDone
|
|
SUBQ $16, inl
|
|
MOVOU (inp), T0
|
|
PXOR T0, A0
|
|
MOVOU A0, (oup)
|
|
LEAQ 16(inp), inp
|
|
LEAQ 16(oup), oup
|
|
MOVO B0, A0
|
|
MOVO C0, B0
|
|
MOVO D0, C0
|
|
JMP openSSETail64DecLoop
|
|
|
|
openSSETail64DecLoopDone:
|
|
MOVO A0, A1
|
|
JMP openSSETail16
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 128 bytes of ciphertext
|
|
openSSETail128:
|
|
// Need to decrypt up to 128 bytes - prepare two blocks
|
|
MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
|
|
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
|
|
XORQ itr2, itr2
|
|
MOVQ inl, itr1
|
|
ANDQ $-16, itr1
|
|
|
|
openSSETail128LoopA:
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
polyAdd(0(inp)(itr2*1))
|
|
polyMul
|
|
|
|
openSSETail128LoopB:
|
|
ADDQ $16, itr2
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
shiftB1Left; shiftC1Left; shiftD1Left
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
shiftB1Right; shiftC1Right; shiftD1Right
|
|
|
|
CMPQ itr2, itr1
|
|
JB openSSETail128LoopA
|
|
|
|
CMPQ itr2, $160
|
|
JNE openSSETail128LoopB
|
|
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
|
|
PADDL state1Store, B0; PADDL state1Store, B1
|
|
PADDL state2Store, C0; PADDL state2Store, C1
|
|
PADDL ctr1Store, D0; PADDL ctr0Store, D1
|
|
|
|
MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
|
|
PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
|
|
MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
|
|
|
|
SUBQ $64, inl
|
|
LEAQ 64(inp), inp
|
|
LEAQ 64(oup), oup
|
|
JMP openSSETail64DecLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 192 bytes of ciphertext
|
|
openSSETail192:
|
|
// Need to decrypt up to 192 bytes - prepare three blocks
|
|
MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
|
|
MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
|
|
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
|
|
|
|
MOVQ inl, itr1
|
|
MOVQ $160, itr2
|
|
CMPQ itr1, $160
|
|
CMOVQGT itr2, itr1
|
|
ANDQ $-16, itr1
|
|
XORQ itr2, itr2
|
|
|
|
openSSLTail192LoopA:
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
polyAdd(0(inp)(itr2*1))
|
|
polyMul
|
|
|
|
openSSLTail192LoopB:
|
|
ADDQ $16, itr2
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
shiftB1Left; shiftC1Left; shiftD1Left
|
|
shiftB2Left; shiftC2Left; shiftD2Left
|
|
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
shiftB1Right; shiftC1Right; shiftD1Right
|
|
shiftB2Right; shiftC2Right; shiftD2Right
|
|
|
|
CMPQ itr2, itr1
|
|
JB openSSLTail192LoopA
|
|
|
|
CMPQ itr2, $160
|
|
JNE openSSLTail192LoopB
|
|
|
|
CMPQ inl, $176
|
|
JB openSSLTail192Store
|
|
|
|
polyAdd(160(inp))
|
|
polyMul
|
|
|
|
CMPQ inl, $192
|
|
JB openSSLTail192Store
|
|
|
|
polyAdd(176(inp))
|
|
polyMul
|
|
|
|
openSSLTail192Store:
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
|
|
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
|
|
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
|
|
PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
|
|
|
|
MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
|
|
PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
|
|
MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
|
|
|
|
MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
|
|
PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
|
|
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
|
|
|
|
SUBQ $128, inl
|
|
LEAQ 128(inp), inp
|
|
LEAQ 128(oup), oup
|
|
JMP openSSETail64DecLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 256 bytes of ciphertext
|
|
openSSETail256:
|
|
// Need to decrypt up to 256 bytes - prepare four blocks
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
|
|
|
|
// Store counters
|
|
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
|
|
XORQ itr2, itr2
|
|
|
|
openSSETail256Loop:
|
|
// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
|
|
polyAdd(0(inp)(itr2*1))
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
|
|
polyMulStage1
|
|
polyMulStage2
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
polyMulStage3
|
|
polyMulReduceStage
|
|
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
|
|
ADDQ $2*8, itr2
|
|
CMPQ itr2, $160
|
|
JB openSSETail256Loop
|
|
MOVQ inl, itr1
|
|
ANDQ $-16, itr1
|
|
|
|
openSSETail256HashLoop:
|
|
polyAdd(0(inp)(itr2*1))
|
|
polyMul
|
|
ADDQ $2*8, itr2
|
|
CMPQ itr2, itr1
|
|
JB openSSETail256HashLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
|
|
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
|
|
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
|
|
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
|
|
MOVO D3, tmpStore
|
|
|
|
// Load - xor - store
|
|
MOVOU (0*16)(inp), D3; PXOR D3, A0
|
|
MOVOU (1*16)(inp), D3; PXOR D3, B0
|
|
MOVOU (2*16)(inp), D3; PXOR D3, C0
|
|
MOVOU (3*16)(inp), D3; PXOR D3, D0
|
|
MOVOU A0, (0*16)(oup)
|
|
MOVOU B0, (1*16)(oup)
|
|
MOVOU C0, (2*16)(oup)
|
|
MOVOU D0, (3*16)(oup)
|
|
MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
|
|
PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
|
|
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
|
|
MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
|
|
PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
|
|
MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
|
|
LEAQ 192(inp), inp
|
|
LEAQ 192(oup), oup
|
|
SUBQ $192, inl
|
|
MOVO A3, A0
|
|
MOVO B3, B0
|
|
MOVO C3, C0
|
|
MOVO tmpStore, D0
|
|
|
|
JMP openSSETail64DecLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// ------------------------- AVX2 Code ----------------------------------------
|
|
// AVX2 entry of the open (decrypt) path.
// Builds one two-lane ChaCha20 state in AA0/BB0/CC0/DD0, dispatches short
// inputs (inl <= 192, inl <= 320) to dedicated handlers, and otherwise derives
// the Poly1305 key, hashes the additional data plus the first 64 ciphertext
// bytes, decrypts those 64 bytes and falls through into openAVX2MainLoop.
chacha20Poly1305Open_AVX2:
|
|
VZEROUPPER
|
|
// Row 0 of the state is the constant row.
VMOVDQU ·chacha20Constants<>(SB), AA0
|
|
// Rows 1-3 are broadcast from 16/32/48(keyp) into both 128-bit lanes.
// The instructions are hand-encoded as raw bytes; the trailing comments give the decoded form.
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
|
|
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
|
|
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
|
|
// NOTE(review): avx2InitMask is defined outside this view; presumably it bumps the
// counter in one lane so the two lanes describe consecutive blocks - confirm.
VPADDD ·avx2InitMask<>(SB), DD0, DD0
|
|
|
|
// Special optimization, for very short buffers
|
|
CMPQ inl, $192
|
|
JBE openAVX2192
|
|
CMPQ inl, $320
|
|
JBE openAVX2320
|
|
|
|
// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
|
|
// Rows 1-3 are saved so they can be added back after the rounds and reloaded by later sections.
VMOVDQA BB0, state1StoreAVX2
|
|
VMOVDQA CC0, state2StoreAVX2
|
|
VMOVDQA DD0, ctr3StoreAVX2
|
|
// itr2 counts ten passes of the loop below; each pass runs chachaQR_AVX2 twice.
MOVQ $10, itr2
|
|
|
|
openAVX2PreparePolyKey:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
|
|
// Rotate rows 1-3 by 4/8/12 bytes before the second quarter-round pass ...
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
|
|
// ... and rotate them back by 12/8/4 afterwards.
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
|
|
DECQ itr2
|
|
JNE openAVX2PreparePolyKey
|
|
|
|
// Add the initial state back in.
VPADDD ·chacha20Constants<>(SB), AA0, AA0
|
|
VPADDD state1StoreAVX2, BB0, BB0
|
|
VPADDD state2StoreAVX2, CC0, CC0
|
|
VPADDD ctr3StoreAVX2, DD0, DD0
|
|
|
|
// Selector 0x02 gathers the low 128-bit lanes of AA0 and BB0 into TT0 (32 bytes).
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>(SB), TT0, TT0
|
|
VMOVDQA TT0, rsStoreAVX2
|
|
|
|
// Stream for the first 64 bytes
|
|
// Selector 0x13 gathers the high lanes: AA0 <- AA0/BB0, BB0 <- CC0/DD0.
VPERM2I128 $0x13, AA0, BB0, AA0
|
|
VPERM2I128 $0x13, CC0, DD0, BB0
|
|
|
|
// Hash AD + first 64 bytes
|
|
// polyHashADInternal is defined outside this view; it is called with itr2 = ad_len.
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ itr1, itr1
|
|
|
|
// itr1 is the byte offset into the ciphertext: four steps of 16 bytes.
openAVX2InitialHash64:
|
|
polyAdd(0(inp)(itr1*1))
|
|
polyMulAVX2
|
|
ADDQ $16, itr1
|
|
CMPQ itr1, $64
|
|
JNE openAVX2InitialHash64
|
|
|
|
// Decrypt the first 64 bytes
|
|
VPXOR (0*32)(inp), AA0, AA0
|
|
VPXOR (1*32)(inp), BB0, BB0
|
|
VMOVDQU AA0, (0*32)(oup)
|
|
VMOVDQU BB0, (1*32)(oup)
|
|
// Advance both pointers and shrink the remaining length by the 64 bytes just handled.
LEAQ (2*32)(inp), inp
|
|
LEAQ (2*32)(oup), oup
|
|
SUBQ $64, inl
|
|
|
|
// Main AVX2 open loop: while at least 512 bytes remain, generate 512 bytes of
// key stream (four two-lane states) and hash the same 512 ciphertext bytes,
// then XOR, store and advance by 512.
openAVX2MainLoop:
|
|
CMPQ inl, $512
|
|
JB openAVX2MainLoopDone
|
|
|
|
// Load state, increment counter blocks, store the incremented counters
|
|
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
|
|
// Each counter row is the previous one plus avx2IncMask, starting from the last counter used (ctr3StoreAVX2).
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
|
|
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
|
|
// itr1 is the byte offset of the next ciphertext block to hash; it advances by 48 per pass until 480.
XORQ itr1, itr1
|
|
|
|
openAVX2InternalLoop:
|
|
// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
|
|
// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
|
|
polyAdd(0*8(inp)(itr1*1))
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
polyMulStage1_AVX2
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
polyMulStage2_AVX2
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
polyMulStage3_AVX2
|
|
// CC3 is spilled so it can serve as scratch for the shift/shift/xor rotate-left-by-12 of the B rows.
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulReduceStage
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
// Second ciphertext block of this pass (offset itr1+16).
polyAdd(2*8(inp)(itr1*1))
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
polyMulStage1_AVX2
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
// Same spill trick, this time for the rotate-left-by-7.
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulStage2_AVX2
|
|
// Rotate rows 1-3 by 4/8/12 bytes before the second quarter-round pass.
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
polyMulStage3_AVX2
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
polyMulReduceStage
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
// Third ciphertext block of this pass (offset itr1+32); it is added before itr1 is advanced by 48.
polyAdd(4*8(inp)(itr1*1))
|
|
LEAQ (6*8)(itr1), itr1
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulStage1_AVX2
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
polyMulStage2_AVX2
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
polyMulStage3_AVX2
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulReduceStage
|
|
// Undo the row rotation (12/8/4) so the next pass starts from the unrotated layout.
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
|
|
// 480 / 48 = 10 passes.
CMPQ itr1, $480
|
|
JNE openAVX2InternalLoop
|
|
|
|
// Add the initial state (constants, saved rows, per-state counters) back in.
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
|
|
// CC3 is parked on the stack because the register is reused as a temporary below; it is read back from tmpStoreAVX2 for the last 128 bytes.
VMOVDQA CC3, tmpStoreAVX2
|
|
|
|
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
|
|
polyAdd(480(inp))
|
|
polyMulAVX2
|
|
// Bytes 0-127: lane-shuffle the first state into stream order, XOR with the ciphertext, store.
VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
|
|
VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
|
|
VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
|
|
// Bytes 128-255 from the second state.
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
|
|
|
|
// and here
|
|
polyAdd(496(inp))
|
|
polyMulAVX2
|
|
// Bytes 256-383 from the third state.
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
|
|
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
|
|
// Bytes 384-511 from the fourth state; its C row is read from tmpStoreAVX2.
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
|
|
VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
|
|
// Advance by 512 bytes and re-test the remaining length.
LEAQ (32*16)(inp), inp
|
|
LEAQ (32*16)(oup), oup
|
|
SUBQ $(32*16), inl
|
|
JMP openAVX2MainLoop
|
|
|
|
// Fewer than 512 bytes remain: pick the tail handler by remaining length.
// The comparisons are unsigned (JBE).
openAVX2MainLoopDone:
|
|
// Handle the various tail sizes efficiently
|
|
// Nothing left: go straight to finalization (openSSEFinalize is defined outside this view).
TESTQ inl, inl
|
|
JE openSSEFinalize
|
|
// 1..128 bytes
CMPQ inl, $128
|
|
JBE openAVX2Tail128
|
|
// 129..256 bytes
CMPQ inl, $256
|
|
JBE openAVX2Tail256
|
|
// 257..384 bytes
CMPQ inl, $384
|
|
JBE openAVX2Tail384
|
|
// 385..511 bytes
JMP openAVX2Tail512
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for buffers smaller than 193 bytes
|
|
// Short-input path, entered from the AVX2 entry when inl <= 192.
// Runs two two-lane states (AA0..DD0 and AA1..DD1), derives the Poly1305 key
// from the first 32 bytes and leaves 192 bytes of key stream in
// AA0, BB0, CC0, DD0, AA1, BB1 before falling through into openAVX2ShortOpen.
openAVX2192:
|
|
// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
|
|
VMOVDQA AA0, AA1
|
|
VMOVDQA BB0, BB1
|
|
VMOVDQA CC0, CC1
|
|
// The second state uses the next counter value.
VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
// AA2/BB2/CC2/DD2 and TT3 keep the initial rows for the add-back after the rounds.
VMOVDQA AA0, AA2
|
|
VMOVDQA BB0, BB2
|
|
VMOVDQA CC0, CC2
|
|
VMOVDQA DD0, DD2
|
|
VMOVDQA DD1, TT3
|
|
// Ten passes, each running chachaQR_AVX2 twice per state.
MOVQ $10, itr2
|
|
|
|
openAVX2192InnerCipherLoop:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
|
|
DECQ itr2
|
|
JNE openAVX2192InnerCipherLoop
|
|
// Add the saved initial rows back in (TT3 holds the second state's counter row).
VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
|
|
VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
|
|
VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
|
|
VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
|
|
// Low lanes of AA0/BB0 form the 32-byte Poly1305 key material.
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>(SB), TT0, TT0
|
|
VMOVDQA TT0, rsStoreAVX2
|
|
|
|
// Stream for up to 192 bytes
|
|
// Result order consumed by openAVX2ShortOpenLoop: AA0, BB0, CC0, DD0, AA1, BB1.
VPERM2I128 $0x13, AA0, BB0, AA0
|
|
VPERM2I128 $0x13, CC0, DD0, BB0
|
|
VPERM2I128 $0x02, AA1, BB1, CC0
|
|
VPERM2I128 $0x02, CC1, DD1, DD0
|
|
VPERM2I128 $0x13, AA1, BB1, AA1
|
|
VPERM2I128 $0x13, CC1, DD1, BB1
|
|
|
|
// Shared tail of the short-input paths (reached by fall-through from openAVX2192
// and by jump from openAVX2320). Expects the key stream queued in
// AA0, BB0, CC0, DD0, AA1, BB1, CC1, DD1, AA2, BB2 (in that order) and the
// Poly1305 key already stored. Hashes the additional data, then hashes and
// decrypts the ciphertext 32 bytes at a time.
openAVX2ShortOpen:
|
|
// Hash
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
openAVX2ShortOpenLoop:
|
|
// Fewer than 32 bytes left: hand over to the 16-byte / partial-block handling.
CMPQ inl, $32
|
|
JB openAVX2ShortTail32
|
|
SUBQ $32, inl
|
|
|
|
// Load for hashing
|
|
// Two 16-byte blocks of ciphertext are hashed before they are decrypted.
polyAdd(0*8(inp))
|
|
polyMulAVX2
|
|
polyAdd(2*8(inp))
|
|
polyMulAVX2
|
|
|
|
// Load for decryption
|
|
VPXOR (inp), AA0, AA0
|
|
VMOVDQU AA0, (oup)
|
|
LEAQ (1*32)(inp), inp
|
|
LEAQ (1*32)(oup), oup
|
|
|
|
// Shift stream left
|
|
// Each register takes the value of its successor in the queue, so AA0 always holds the next 32 bytes.
VMOVDQA BB0, AA0
|
|
VMOVDQA CC0, BB0
|
|
VMOVDQA DD0, CC0
|
|
VMOVDQA AA1, DD0
|
|
VMOVDQA BB1, AA1
|
|
VMOVDQA CC1, BB1
|
|
VMOVDQA DD1, CC1
|
|
VMOVDQA AA2, DD1
|
|
VMOVDQA BB2, AA2
|
|
JMP openAVX2ShortOpenLoop
|
|
|
|
// Fewer than 32 bytes remain on the short-input path. Handles one more full
// 16-byte block if present, then leaves the AVX2 domain and jumps to
// openSSETail16 (defined outside this view) for the final partial block.
// NOTE(review): A0/A1/T0 are presumably the XMM aliases of the low lanes of
// AA0/AA1/TT0, and openSSETail16 presumably takes its key stream in A1 - confirm
// against the register #defines and that label.
openAVX2ShortTail32:
|
|
CMPQ inl, $16
|
|
// VMOVDQA does not modify flags, so the JB below still tests the CMPQ above.
VMOVDQA A0, A1
|
|
JB openAVX2ShortDone
|
|
|
|
SUBQ $16, inl
|
|
|
|
// Load for hashing
|
|
polyAdd(0*8(inp))
|
|
polyMulAVX2
|
|
|
|
// Load for decryption
|
|
VPXOR (inp), A0, T0
|
|
VMOVDQU T0, (oup)
|
|
LEAQ (1*16)(inp), inp
|
|
LEAQ (1*16)(oup), oup
|
|
// Selector 0x11 copies the high lane of AA0 into both lanes, exposing the next 16 stream bytes in the low lane.
VPERM2I128 $0x11, AA0, AA0, AA0
|
|
VMOVDQA A0, A1
|
|
|
|
openAVX2ShortDone:
|
|
VZEROUPPER
|
|
JMP openSSETail16
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for buffers smaller than 321 bytes
|
|
// Short-input path, entered from the AVX2 entry when 192 < inl <= 320.
// Runs three two-lane states, derives the Poly1305 key from the first 32 bytes
// and leaves 320 bytes of key stream queued in
// AA0, BB0, CC0, DD0, AA1, BB1, CC1, DD1, AA2, BB2 for openAVX2ShortOpen.
openAVX2320:
|
|
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
|
|
// States 1 and 2 are copies of state 0 with the counter row bumped by avx2IncMask once and twice.
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
|
|
// TT1/TT2/TT3 keep the initial B, C and D rows for the add-back after the rounds.
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
|
|
// Ten passes, each running chachaQR_AVX2 twice per state.
MOVQ $10, itr2
|
|
|
|
openAVX2320InnerCipherLoop:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
|
|
DECQ itr2
|
|
JNE openAVX2320InnerCipherLoop
|
|
|
|
// Add the initial state back in; TT0 is free again and temporarily holds the constant row.
VMOVDQA ·chacha20Constants<>(SB), TT0
|
|
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
|
|
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
|
|
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
|
|
// The counter rows are rebuilt on the fly: TT3, TT3+inc, TT3+2*inc.
VMOVDQA ·avx2IncMask<>(SB), TT0
|
|
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
|
|
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
|
|
VPADDD TT3, DD2, DD2
|
|
|
|
// Clamp and store poly key
|
|
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
VPAND ·polyClampMask<>(SB), TT0, TT0
|
|
VMOVDQA TT0, rsStoreAVX2
|
|
|
|
// Stream for up to 320 bytes
|
|
// Selector 0x02 gathers low lanes, 0x13 gathers high lanes; the results are laid out in queue order.
VPERM2I128 $0x13, AA0, BB0, AA0
|
|
VPERM2I128 $0x13, CC0, DD0, BB0
|
|
VPERM2I128 $0x02, AA1, BB1, CC0
|
|
VPERM2I128 $0x02, CC1, DD1, DD0
|
|
VPERM2I128 $0x13, AA1, BB1, AA1
|
|
VPERM2I128 $0x13, CC1, DD1, BB1
|
|
VPERM2I128 $0x02, AA2, BB2, CC1
|
|
VPERM2I128 $0x02, CC2, DD2, DD1
|
|
VPERM2I128 $0x13, AA2, BB2, AA2
|
|
VPERM2I128 $0x13, CC2, DD2, BB2
|
|
JMP openAVX2ShortOpen
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 128 bytes of ciphertext
|
|
// Tail handler for at most 128 remaining bytes (see the dispatch at openAVX2MainLoopDone).
// Generates one two-lane state while hashing every full 16-byte block of the
// remaining ciphertext, then leaves 128 bytes of key stream in AA0, BB0, CC0, DD0
// and falls through into openAVX2TailLoop, which only decrypts.
openAVX2Tail128:
|
|
// Need to decrypt up to 128 bytes - prepare two blocks
|
|
VMOVDQA ·chacha20Constants<>(SB), AA1
|
|
VMOVDQA state1StoreAVX2, BB1
|
|
VMOVDQA state2StoreAVX2, CC1
|
|
VMOVDQA ctr3StoreAVX2, DD1
|
|
VPADDD ·avx2IncMask<>(SB), DD1, DD1
|
|
// DD0 keeps the initial counter row for the add-back after the rounds.
VMOVDQA DD1, DD0
|
|
|
|
// itr2 = byte offset of the next block to hash; it also counts passes (16 per pass, 160 = 10 passes).
XORQ itr2, itr2
|
|
// itr1 = inl rounded down to a multiple of 16 = number of bytes that can be hashed as full blocks.
MOVQ inl, itr1
|
|
ANDQ $-16, itr1
|
|
// No full block available: run the rounds without hashing.
TESTQ itr1, itr1
|
|
JE openAVX2Tail128LoopB
|
|
|
|
openAVX2Tail128LoopA:
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
polyAdd(0(inp)(itr2*1))
|
|
polyMulAVX2
|
|
|
|
openAVX2Tail128LoopB:
|
|
ADDQ $16, itr2
|
|
chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $4, BB1, BB1, BB1
|
|
VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $12, DD1, DD1, DD1
|
|
chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $12, BB1, BB1, BB1
|
|
VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $4, DD1, DD1, DD1
|
|
// More full blocks to hash: take the hashing entry for the next pass.
CMPQ itr2, itr1
|
|
JB openAVX2Tail128LoopA
|
|
// Otherwise keep running rounds only, until ten passes are done.
CMPQ itr2, $160
|
|
JNE openAVX2Tail128LoopB
|
|
|
|
// Add the initial state back in.
VPADDD ·chacha20Constants<>(SB), AA1, AA1
|
|
VPADDD state1StoreAVX2, BB1, BB1
|
|
VPADDD state2StoreAVX2, CC1, CC1
|
|
VPADDD DD0, DD1, DD1
|
|
// Lane-shuffle into stream order: AA0, BB0, CC0, DD0 hold bytes 0-31, 32-63, 64-95, 96-127.
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
|
|
// Common decrypt-only tail of the AVX2 open path. Expects up to 128 bytes of
// key stream queued in AA0, BB0, CC0, DD0. This section performs no hashing;
// the sections that jump or fall through to it have already hashed the full
// 16-byte blocks.
openAVX2TailLoop:
|
|
CMPQ inl, $32
|
|
JB openAVX2Tail
|
|
SUBQ $32, inl
|
|
|
|
// Load for decryption
|
|
VPXOR (inp), AA0, AA0
|
|
VMOVDQU AA0, (oup)
|
|
LEAQ (1*32)(inp), inp
|
|
LEAQ (1*32)(oup), oup
|
|
// Shift the stream queue so AA0 holds the next 32 bytes.
VMOVDQA BB0, AA0
|
|
VMOVDQA CC0, BB0
|
|
VMOVDQA DD0, CC0
|
|
JMP openAVX2TailLoop
|
|
|
|
// Fewer than 32 bytes left: decrypt one more 16-byte block if present.
// NOTE(review): A0/A1/T0 are presumably the XMM aliases of the low lanes of
// AA0/AA1/TT0, and openSSETail16 presumably takes its key stream in A1 - confirm.
openAVX2Tail:
|
|
CMPQ inl, $16
|
|
// VMOVDQA does not modify flags, so the JB below still tests the CMPQ above.
VMOVDQA A0, A1
|
|
JB openAVX2TailDone
|
|
SUBQ $16, inl
|
|
|
|
// Load for decryption
|
|
VPXOR (inp), A0, T0
|
|
VMOVDQU T0, (oup)
|
|
LEAQ (1*16)(inp), inp
|
|
LEAQ (1*16)(oup), oup
|
|
// Selector 0x11 copies the high lane of AA0 into both lanes, exposing the next 16 stream bytes in the low lane.
VPERM2I128 $0x11, AA0, AA0, AA0
|
|
VMOVDQA A0, A1
|
|
|
|
openAVX2TailDone:
|
|
VZEROUPPER
|
|
JMP openSSETail16
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 256 bytes of ciphertext
|
|
// Tail handler for 129..256 remaining bytes (see the dispatch at openAVX2MainLoopDone).
// Runs two two-lane states while hashing ciphertext, hashes whatever full
// 16-byte blocks are still unhashed, decrypts the first 128 bytes directly and
// passes the last 128 bytes of key stream to openAVX2TailLoop.
openAVX2Tail256:
|
|
// Need to decrypt up to 256 bytes - prepare four blocks
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
// TT1/TT2 keep the two initial counter rows for the add-back after the rounds.
VMOVDQA DD0, TT1
|
|
VMOVDQA DD1, TT2
|
|
|
|
// Compute the number of iterations that will hash data
|
|
// inl is saved because the inl register is reused below as the hash read pointer.
MOVQ inl, tmpStoreAVX2
|
|
// itr1 = min((inl - 128) / 16, 10)
MOVQ inl, itr1
|
|
SUBQ $128, itr1
|
|
SHRQ $4, itr1
|
|
MOVQ $10, itr2
|
|
CMPQ itr1, $10
|
|
CMOVQGT itr2, itr1
|
|
MOVQ inp, inl
|
|
// itr2 now counts completed passes (0..10).
XORQ itr2, itr2
|
|
|
|
// Hashing entry: absorbs one 16-byte block, then runs a pass. The first pass always enters here.
openAVX2Tail256LoopA:
|
|
polyAdd(0(inl))
|
|
polyMulAVX2
|
|
LEAQ 16(inl), inl
|
|
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
openAVX2Tail256LoopB:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
|
|
INCQ itr2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
|
|
// While fewer than itr1 passes are done, the next pass also hashes a block.
CMPQ itr2, itr1
|
|
JB openAVX2Tail256LoopA
|
|
|
|
// Remaining passes run rounds only, until ten are done.
CMPQ itr2, $10
|
|
JNE openAVX2Tail256LoopB
|
|
|
|
// itr2 = hash read pointer, itr1 = bytes hashed so far, inl = restored remaining length.
MOVQ inl, itr2
|
|
SUBQ inp, inl
|
|
MOVQ inl, itr1
|
|
MOVQ tmpStoreAVX2, inl
|
|
|
|
// Hash the remainder of data (if any)
|
|
// Continues while another full 16-byte block fits (itr1 + 16 <= inl).
openAVX2Tail256Hash:
|
|
ADDQ $16, itr1
|
|
CMPQ itr1, inl
|
|
JGT openAVX2Tail256HashEnd
|
|
polyAdd (0(itr2))
|
|
polyMulAVX2
|
|
LEAQ 16(itr2), itr2
|
|
JMP openAVX2Tail256Hash
|
|
|
|
// Store 128 bytes safely, then go to store loop
|
|
openAVX2Tail256HashEnd:
|
|
// Add the initial state back in.
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
|
|
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
|
|
// First state -> AA2..DD2 (stream bytes 0-127); second state -> AA0..DD0 (bytes 128-255, consumed by openAVX2TailLoop).
VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
|
|
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
|
|
VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
|
|
VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
|
|
LEAQ (4*32)(inp), inp
|
|
LEAQ (4*32)(oup), oup
|
|
SUBQ $4*32, inl
|
|
|
|
JMP openAVX2TailLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 384 bytes of ciphertext
|
|
// Tail handler for 257..384 remaining bytes (see the dispatch at openAVX2MainLoopDone).
// Runs three two-lane states while hashing ciphertext, hashes whatever full
// 16-byte blocks are still unhashed, decrypts the first 256 bytes directly and
// passes the last 128 bytes of key stream to openAVX2TailLoop.
openAVX2Tail384:
|
|
// Need to decrypt up to 384 bytes - prepare six blocks
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
VPADDD ·avx2IncMask<>(SB), DD1, DD2
|
|
// The three initial counter rows are saved for the add-back after the rounds.
VMOVDQA DD0, ctr0StoreAVX2
|
|
VMOVDQA DD1, ctr1StoreAVX2
|
|
VMOVDQA DD2, ctr2StoreAVX2
|
|
|
|
// Compute the number of passes that hash two blocks of data (the remaining passes hash one)
|
|
// inl is saved because the inl register is reused below as the hash read pointer.
MOVQ inl, tmpStoreAVX2
|
|
// itr1 = min((inl - 256) / 16 + 6, 10)
MOVQ inl, itr1
|
|
SUBQ $256, itr1
|
|
SHRQ $4, itr1
|
|
ADDQ $6, itr1
|
|
MOVQ $10, itr2
|
|
CMPQ itr1, $10
|
|
CMOVQGT itr2, itr1
|
|
MOVQ inp, inl
|
|
// itr2 now counts completed passes (0..10).
XORQ itr2, itr2
|
|
|
|
// Perform ChaCha rounds, while hashing the remaining input
|
|
// Entry that hashes one extra block before the pass; the first pass always enters here.
openAVX2Tail384LoopB:
|
|
polyAdd(0(inl))
|
|
polyMulAVX2
|
|
LEAQ 16(inl), inl
|
|
|
|
// One pass: two chachaQR_AVX2 rounds per state with one hashed block in between.
openAVX2Tail384LoopA:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
|
|
polyAdd(0(inl))
|
|
polyMulAVX2
|
|
LEAQ 16(inl), inl
|
|
INCQ itr2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
|
|
|
|
// While fewer than itr1 passes are done, the next pass hashes two blocks.
CMPQ itr2, itr1
|
|
JB openAVX2Tail384LoopB
|
|
|
|
// Remaining passes hash one block each, until ten passes are done.
CMPQ itr2, $10
|
|
JNE openAVX2Tail384LoopA
|
|
|
|
// itr2 = hash read pointer, itr1 = bytes hashed so far, inl = restored remaining length.
MOVQ inl, itr2
|
|
SUBQ inp, inl
|
|
MOVQ inl, itr1
|
|
MOVQ tmpStoreAVX2, inl
|
|
|
|
// Hash any further full 16-byte blocks (continues while itr1 + 16 <= inl).
openAVX2Tail384Hash:
|
|
ADDQ $16, itr1
|
|
CMPQ itr1, inl
|
|
JGT openAVX2Tail384HashEnd
|
|
polyAdd(0(itr2))
|
|
polyMulAVX2
|
|
LEAQ 16(itr2), itr2
|
|
JMP openAVX2Tail384Hash
|
|
|
|
// Store 256 bytes safely, then go to store loop
|
|
openAVX2Tail384HashEnd:
|
|
// Add the initial state back in.
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
|
|
// Bytes 0-127 from the first state.
VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
|
|
VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
|
|
VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
|
|
// Bytes 128-255 from the second state.
VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
|
|
VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
|
|
VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
|
|
// The third state is queued in AA0..DD0 for openAVX2TailLoop.
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
|
|
LEAQ (8*32)(inp), inp
|
|
LEAQ (8*32)(oup), oup
|
|
SUBQ $8*32, inl
|
|
JMP openAVX2TailLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 512 bytes of ciphertext
|
|
// Tail handler for 385..511 remaining bytes (see the dispatch at openAVX2MainLoopDone).
// Runs four two-lane states; the ten passes hash 384 bytes of ciphertext
// (three blocks in each of the first four passes, two in each of the other six).
// Any further full 16-byte blocks are hashed afterwards, the first 384 bytes are
// decrypted directly and the last 128 bytes of key stream go to openAVX2TailLoop.
openAVX2Tail512:
|
|
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
|
|
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
|
|
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
|
|
// itr1 counts completed passes (0..10); itr2 is the hash read pointer.
XORQ itr1, itr1
|
|
MOVQ inp, itr2
|
|
|
|
// Entry that hashes one extra block before the pass; used for the first four passes.
openAVX2Tail512LoopB:
|
|
polyAdd(0(itr2))
|
|
polyMulAVX2
|
|
LEAQ (2*8)(itr2), itr2
|
|
|
|
// One pass over all four states, with two hashed blocks interleaved.
openAVX2Tail512LoopA:
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
// CC3 is spilled so it can serve as scratch for the shift/shift/xor rotates of the B rows.
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyAdd(0*8(itr2))
|
|
polyMulAVX2
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
// Rotate rows 1-3 by 4/8/12 bytes before the second quarter-round pass.
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
// Second block of this pass; the read pointer then advances by 32 bytes for both blocks.
polyAdd(2*8(itr2))
|
|
polyMulAVX2
|
|
LEAQ (4*8)(itr2), itr2
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
// Undo the row rotation (12/8/4).
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
|
|
INCQ itr1
|
|
// Passes 1-3 re-enter through the extra-hash entry (pass 0 entered there by fall-through).
CMPQ itr1, $4
|
|
JLT openAVX2Tail512LoopB
|
|
|
|
// Passes 4-9 hash two blocks each.
CMPQ itr1, $10
|
|
JNE openAVX2Tail512LoopA
|
|
|
|
// 384 bytes are hashed at this point; itr1 = further bytes that form full 16-byte blocks.
MOVQ inl, itr1
|
|
SUBQ $384, itr1
|
|
ANDQ $-16, itr1
|
|
|
|
openAVX2Tail512HashLoop:
|
|
TESTQ itr1, itr1
|
|
JE openAVX2Tail512HashEnd
|
|
polyAdd(0(itr2))
|
|
polyMulAVX2
|
|
LEAQ 16(itr2), itr2
|
|
SUBQ $16, itr1
|
|
JMP openAVX2Tail512HashLoop
|
|
|
|
openAVX2Tail512HashEnd:
|
|
// Add the initial state back in.
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
|
|
// CC3 is parked on the stack because the register is reused as a temporary below; it is read back from tmpStoreAVX2 for the fourth state.
VMOVDQA CC3, tmpStoreAVX2
|
|
// Bytes 0-127 from the first state.
VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
|
|
VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
|
|
VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
|
|
// Bytes 128-255 from the second state.
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
|
|
// Bytes 256-383 from the third state.
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
|
|
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
|
|
// The fourth state is queued in AA0..DD0 for openAVX2TailLoop.
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
|
|
|
|
LEAQ (12*32)(inp), inp
|
|
LEAQ (12*32)(oup), oup
|
|
SUBQ $12*32, inl
|
|
|
|
JMP openAVX2TailLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// ----------------------------------------------------------------------------
|
|
// func chacha20Poly1305Seal(dst, key, src, ad []byte)
|
|
TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
|
|
// For aligned stack access
|
|
MOVQ SP, BP
|
|
ADDQ $32, BP
|
|
ANDQ $-32, BP
|
|
MOVQ dst+0(FP), oup
|
|
MOVQ key+24(FP), keyp
|
|
MOVQ src+48(FP), inp
|
|
MOVQ src_len+56(FP), inl
|
|
MOVQ ad+72(FP), adp
|
|
|
|
CMPB ·useAVX2(SB), $1
|
|
JE chacha20Poly1305Seal_AVX2
|
|
|
|
// Special optimization, for very short buffers
|
|
CMPQ inl, $128
|
|
JBE sealSSE128 // About 15% faster
|
|
|
|
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
|
|
MOVOU ·chacha20Constants<>(SB), A0
|
|
MOVOU (1*16)(keyp), B0
|
|
MOVOU (2*16)(keyp), C0
|
|
MOVOU (3*16)(keyp), D0
|
|
|
|
// Store state on stack for future use
|
|
MOVO B0, state1Store
|
|
MOVO C0, state2Store
|
|
|
|
// Load state, increment counter blocks
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
|
|
|
|
// Store counters
|
|
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
|
|
MOVQ $10, itr2
|
|
|
|
sealSSEIntroLoop:
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
|
|
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
|
|
DECQ itr2
|
|
JNE sealSSEIntroLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
|
|
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
|
|
PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
|
|
PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
|
|
|
|
// Clamp and store the key
|
|
PAND ·polyClampMask<>(SB), A0
|
|
MOVO A0, rStore
|
|
MOVO B0, sStore
|
|
|
|
// Hash AAD
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
|
|
PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
|
|
MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
|
|
MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
|
|
PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
|
|
MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
|
|
|
|
MOVQ $128, itr1
|
|
SUBQ $128, inl
|
|
LEAQ 128(inp), inp
|
|
|
|
MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
|
|
|
|
CMPQ inl, $64
|
|
JBE sealSSE128SealHash
|
|
|
|
MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
|
|
PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
|
|
MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
|
|
|
|
ADDQ $64, itr1
|
|
SUBQ $64, inl
|
|
LEAQ 64(inp), inp
|
|
|
|
MOVQ $2, itr1
|
|
MOVQ $8, itr2
|
|
|
|
CMPQ inl, $64
|
|
JBE sealSSETail64
|
|
CMPQ inl, $128
|
|
JBE sealSSETail128
|
|
CMPQ inl, $192
|
|
JBE sealSSETail192
|
|
|
|
sealSSEMainLoop:
|
|
// Load state, increment counter blocks
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
|
|
|
|
// Store counters
|
|
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
|
|
|
|
sealSSEInnerLoop:
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
polyAdd(0(oup))
|
|
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
|
|
polyMulStage1
|
|
polyMulStage2
|
|
LEAQ (2*8)(oup), oup
|
|
MOVO C3, tmpStore
|
|
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
|
|
MOVO tmpStore, C3
|
|
MOVO C1, tmpStore
|
|
polyMulStage3
|
|
chachaQR(A3, B3, C3, D3, C1)
|
|
MOVO tmpStore, C1
|
|
polyMulReduceStage
|
|
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
|
|
DECQ itr2
|
|
JGE sealSSEInnerLoop
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ (2*8)(oup), oup
|
|
DECQ itr1
|
|
JG sealSSEInnerLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
|
|
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
|
|
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
|
|
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
|
|
MOVO D3, tmpStore
|
|
|
|
// Load - xor - store
|
|
MOVOU (0*16)(inp), D3; PXOR D3, A0
|
|
MOVOU (1*16)(inp), D3; PXOR D3, B0
|
|
MOVOU (2*16)(inp), D3; PXOR D3, C0
|
|
MOVOU (3*16)(inp), D3; PXOR D3, D0
|
|
MOVOU A0, (0*16)(oup)
|
|
MOVOU B0, (1*16)(oup)
|
|
MOVOU C0, (2*16)(oup)
|
|
MOVOU D0, (3*16)(oup)
|
|
MOVO tmpStore, D3
|
|
|
|
MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
|
|
PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
|
|
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
|
|
MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
|
|
PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
|
|
MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
|
|
ADDQ $192, inp
|
|
MOVQ $192, itr1
|
|
SUBQ $192, inl
|
|
MOVO A3, A1
|
|
MOVO B3, B1
|
|
MOVO C3, C1
|
|
MOVO D3, D1
|
|
CMPQ inl, $64
|
|
JBE sealSSE128SealHash
|
|
MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
|
|
PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
|
|
MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
|
|
LEAQ 64(inp), inp
|
|
SUBQ $64, inl
|
|
MOVQ $6, itr1
|
|
MOVQ $4, itr2
|
|
CMPQ inl, $192
|
|
JG sealSSEMainLoop
|
|
|
|
MOVQ inl, itr1
|
|
TESTQ inl, inl
|
|
JE sealSSE128SealHash
|
|
MOVQ $6, itr1
|
|
CMPQ inl, $64
|
|
JBE sealSSETail64
|
|
CMPQ inl, $128
|
|
JBE sealSSETail128
|
|
JMP sealSSETail192
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 64 bytes of plaintext
|
|
sealSSETail64:
|
|
// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
|
|
MOVO ·chacha20Constants<>(SB), A1
|
|
MOVO state1Store, B1
|
|
MOVO state2Store, C1
|
|
MOVO ctr3Store, D1
|
|
PADDL ·sseIncMask<>(SB), D1
|
|
MOVO D1, ctr0Store
|
|
|
|
sealSSETail64LoopA:
|
|
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealSSETail64LoopB:
|
|
chachaQR(A1, B1, C1, D1, T1)
|
|
shiftB1Left; shiftC1Left; shiftD1Left
|
|
chachaQR(A1, B1, C1, D1, T1)
|
|
shiftB1Right; shiftC1Right; shiftD1Right
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
DECQ itr1
|
|
JG sealSSETail64LoopA
|
|
|
|
DECQ itr2
|
|
JGE sealSSETail64LoopB
|
|
PADDL ·chacha20Constants<>(SB), A1
|
|
PADDL state1Store, B1
|
|
PADDL state2Store, C1
|
|
PADDL ctr0Store, D1
|
|
|
|
JMP sealSSE128Seal
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 128 bytes of plaintext
|
|
sealSSETail128:
|
|
// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
|
|
|
|
sealSSETail128LoopA:
|
|
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealSSETail128LoopB:
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
shiftB1Left; shiftC1Left; shiftD1Left
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
shiftB1Right; shiftC1Right; shiftD1Right
|
|
|
|
DECQ itr1
|
|
JG sealSSETail128LoopA
|
|
|
|
DECQ itr2
|
|
JGE sealSSETail128LoopB
|
|
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
|
|
PADDL state1Store, B0; PADDL state1Store, B1
|
|
PADDL state2Store, C0; PADDL state2Store, C1
|
|
PADDL ctr0Store, D0; PADDL ctr1Store, D1
|
|
|
|
MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
|
|
PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
|
|
MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
|
|
|
|
MOVQ $64, itr1
|
|
LEAQ 64(inp), inp
|
|
SUBQ $64, inl
|
|
|
|
JMP sealSSE128SealHash
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 192 bytes of plaintext
|
|
sealSSETail192:
|
|
// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
|
|
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
|
|
|
|
sealSSETail192LoopA:
|
|
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealSSETail192LoopB:
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Left; shiftC0Left; shiftD0Left
|
|
shiftB1Left; shiftC1Left; shiftD1Left
|
|
shiftB2Left; shiftC2Left; shiftD2Left
|
|
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Right; shiftC0Right; shiftD0Right
|
|
shiftB1Right; shiftC1Right; shiftD1Right
|
|
shiftB2Right; shiftC2Right; shiftD2Right
|
|
|
|
DECQ itr1
|
|
JG sealSSETail192LoopA
|
|
|
|
DECQ itr2
|
|
JGE sealSSETail192LoopB
|
|
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
|
|
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
|
|
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
|
|
PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
|
|
|
|
MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
|
|
PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
|
|
MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
|
|
MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
|
|
PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
|
|
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
|
|
|
|
MOVO A2, A1
|
|
MOVO B2, B1
|
|
MOVO C2, C1
|
|
MOVO D2, D1
|
|
MOVQ $128, itr1
|
|
LEAQ 128(inp), inp
|
|
SUBQ $128, inl
|
|
|
|
JMP sealSSE128SealHash
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special seal optimization for buffers smaller than 129 bytes
|
|
sealSSE128:
|
|
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
|
|
MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
|
|
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
|
|
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
|
|
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
|
|
MOVQ $10, itr2
|
|
|
|
sealSSE128InnerCipherLoop:
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Left; shiftB1Left; shiftB2Left
|
|
shiftC0Left; shiftC1Left; shiftC2Left
|
|
shiftD0Left; shiftD1Left; shiftD2Left
|
|
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
|
|
shiftB0Right; shiftB1Right; shiftB2Right
|
|
shiftC0Right; shiftC1Right; shiftC2Right
|
|
shiftD0Right; shiftD1Right; shiftD2Right
|
|
DECQ itr2
|
|
JNE sealSSE128InnerCipherLoop
|
|
|
|
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
|
|
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
|
|
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
|
|
PADDL T2, C1; PADDL T2, C2
|
|
PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
|
|
PAND ·polyClampMask<>(SB), A0
|
|
MOVOU A0, rStore
|
|
MOVOU B0, sStore
|
|
|
|
// Hash AAD
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ itr1, itr1
|
|
|
|
sealSSE128SealHash:
|
|
// itr1 holds the number of bytes encrypted but not yet hashed
|
|
CMPQ itr1, $16
|
|
JB sealSSE128Seal
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
|
|
SUBQ $16, itr1
|
|
ADDQ $16, oup
|
|
|
|
JMP sealSSE128SealHash
|
|
|
|
sealSSE128Seal:
|
|
CMPQ inl, $16
|
|
JB sealSSETail
|
|
SUBQ $16, inl
|
|
|
|
// Load for encryption
|
|
MOVOU (inp), T0
|
|
PXOR T0, A1
|
|
MOVOU A1, (oup)
|
|
LEAQ (1*16)(inp), inp
|
|
LEAQ (1*16)(oup), oup
|
|
|
|
// Extract for hashing
|
|
MOVQ A1, t0
|
|
PSRLDQ $8, A1
|
|
MOVQ A1, t1
|
|
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
|
|
polyMul
|
|
|
|
// Shift the stream "left"
|
|
MOVO B1, A1
|
|
MOVO C1, B1
|
|
MOVO D1, C1
|
|
MOVO A2, D1
|
|
MOVO B2, A2
|
|
MOVO C2, B2
|
|
MOVO D2, C2
|
|
JMP sealSSE128Seal
|
|
|
|
sealSSETail:
|
|
TESTQ inl, inl
|
|
JE sealSSEFinalize
|
|
|
|
// We can only load the PT one byte at a time to avoid read after end of buffer
|
|
MOVQ inl, itr2
|
|
SHLQ $4, itr2
|
|
LEAQ ·andMask<>(SB), t0
|
|
MOVQ inl, itr1
|
|
LEAQ -1(inp)(inl*1), inp
|
|
XORQ t2, t2
|
|
XORQ t3, t3
|
|
XORQ AX, AX
|
|
|
|
sealSSETailLoadLoop:
|
|
SHLQ $8, t2, t3
|
|
SHLQ $8, t2
|
|
MOVB (inp), AX
|
|
XORQ AX, t2
|
|
LEAQ -1(inp), inp
|
|
DECQ itr1
|
|
JNE sealSSETailLoadLoop
|
|
MOVQ t2, 0+tmpStore
|
|
MOVQ t3, 8+tmpStore
|
|
PXOR 0+tmpStore, A1
|
|
MOVOU A1, (oup)
|
|
MOVOU -16(t0)(itr2*1), T0
|
|
PAND T0, A1
|
|
MOVQ A1, t0
|
|
PSRLDQ $8, A1
|
|
MOVQ A1, t1
|
|
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
|
|
polyMul
|
|
|
|
ADDQ inl, oup
|
|
|
|
sealSSEFinalize:
|
|
// Hash in the buffer lengths
|
|
ADDQ ad_len+80(FP), acc0
|
|
ADCQ src_len+56(FP), acc1
|
|
ADCQ $1, acc2
|
|
polyMul
|
|
|
|
// Final reduce
|
|
MOVQ acc0, t0
|
|
MOVQ acc1, t1
|
|
MOVQ acc2, t2
|
|
SUBQ $-5, acc0
|
|
SBBQ $-1, acc1
|
|
SBBQ $3, acc2
|
|
CMOVQCS t0, acc0
|
|
CMOVQCS t1, acc1
|
|
CMOVQCS t2, acc2
|
|
|
|
// Add in the "s" part of the key
|
|
ADDQ 0+sStore, acc0
|
|
ADCQ 8+sStore, acc1
|
|
|
|
// Finally store the tag at the end of the message
|
|
MOVQ acc0, (0*8)(oup)
|
|
MOVQ acc1, (1*8)(oup)
|
|
RET
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// ------------------------- AVX2 Code ----------------------------------------
|
|
chacha20Poly1305Seal_AVX2:
|
|
VZEROUPPER
|
|
VMOVDQU ·chacha20Constants<>(SB), AA0
|
|
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
|
|
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
|
|
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
|
|
VPADDD ·avx2InitMask<>(SB), DD0, DD0
|
|
|
|
// Special optimizations, for very short buffers
|
|
CMPQ inl, $192
|
|
JBE seal192AVX2 // 33% faster
|
|
CMPQ inl, $320
|
|
JBE seal320AVX2 // 17% faster
|
|
|
|
// For the general case prepare the Poly1305 key first - as a byproduct we have 64 bytes of cipher stream
|
|
VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
|
|
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
|
|
VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
|
|
VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
|
|
VMOVDQA DD3, ctr3StoreAVX2
|
|
MOVQ $10, itr2
|
|
|
|
sealAVX2IntroLoop:
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VMOVDQA CC1, tmpStoreAVX2
|
|
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
|
|
VMOVDQA tmpStoreAVX2, CC1
|
|
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
|
|
VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
|
|
VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
|
|
VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
|
|
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VMOVDQA CC1, tmpStoreAVX2
|
|
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
|
|
VMOVDQA tmpStoreAVX2, CC1
|
|
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
|
|
VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
|
|
VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
|
|
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
|
|
DECQ itr2
|
|
JNE sealAVX2IntroLoop
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
|
|
|
|
VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
|
|
VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
|
|
VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>(SB), DD0, DD0
|
|
VMOVDQA DD0, rsStoreAVX2
|
|
|
|
// Hash AD
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
// Can store at least 320 bytes
|
|
VPXOR (0*32)(inp), AA0, AA0
|
|
VPXOR (1*32)(inp), CC0, CC0
|
|
VMOVDQU AA0, (0*32)(oup)
|
|
VMOVDQU CC0, (1*32)(oup)
|
|
|
|
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
|
|
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
|
|
VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
|
|
|
|
MOVQ $320, itr1
|
|
SUBQ $320, inl
|
|
LEAQ 320(inp), inp
|
|
|
|
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
|
|
CMPQ inl, $128
|
|
JBE sealAVX2SealHash
|
|
|
|
VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
|
|
SUBQ $128, inl
|
|
LEAQ 128(inp), inp
|
|
|
|
MOVQ $8, itr1
|
|
MOVQ $2, itr2
|
|
|
|
CMPQ inl, $128
|
|
JBE sealAVX2Tail128
|
|
CMPQ inl, $256
|
|
JBE sealAVX2Tail256
|
|
CMPQ inl, $384
|
|
JBE sealAVX2Tail384
|
|
CMPQ inl, $512
|
|
JBE sealAVX2Tail512
|
|
|
|
// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
|
|
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
|
|
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VMOVDQA CC1, tmpStoreAVX2
|
|
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
|
|
VMOVDQA tmpStoreAVX2, CC1
|
|
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
|
|
VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
|
|
VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
|
|
VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
|
|
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VMOVDQA CC1, tmpStoreAVX2
|
|
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
|
|
VMOVDQA tmpStoreAVX2, CC1
|
|
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
|
|
VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
|
|
VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
|
|
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
|
|
SUBQ $16, oup // Adjust the pointer
|
|
MOVQ $9, itr1
|
|
JMP sealAVX2InternalLoopStart
|
|
|
|
sealAVX2MainLoop:
|
|
// Load state, increment counter blocks, store the incremented counters
|
|
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
|
|
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
|
|
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
|
|
MOVQ $10, itr1
|
|
|
|
sealAVX2InternalLoop:
|
|
polyAdd(0*8(oup))
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
polyMulStage1_AVX2
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
polyMulStage2_AVX2
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
polyMulStage3_AVX2
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulReduceStage
|
|
|
|
sealAVX2InternalLoopStart:
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
polyAdd(2*8(oup))
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
polyMulStage1_AVX2
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulStage2_AVX2
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
polyMulStage3_AVX2
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
polyMulReduceStage
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
polyAdd(4*8(oup))
|
|
LEAQ (6*8)(oup), oup
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulStage1_AVX2
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
polyMulStage2_AVX2
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
polyMulStage3_AVX2
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyMulReduceStage
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
|
|
DECQ itr1
|
|
JNE sealAVX2InternalLoop
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
|
|
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
|
|
polyAdd(0*8(oup))
|
|
polyMulAVX2
|
|
LEAQ (4*8)(oup), oup
|
|
VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
|
|
VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
|
|
VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
|
|
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
|
|
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
|
|
|
|
// and here
|
|
polyAdd(-2*8(oup))
|
|
polyMulAVX2
|
|
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
|
|
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
|
|
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
|
|
VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
|
|
LEAQ (32*16)(inp), inp
|
|
SUBQ $(32*16), inl
|
|
CMPQ inl, $512
|
|
JG sealAVX2MainLoop
|
|
|
|
// Tail can only hash 480 bytes
|
|
polyAdd(0*8(oup))
|
|
polyMulAVX2
|
|
polyAdd(2*8(oup))
|
|
polyMulAVX2
|
|
LEAQ 32(oup), oup
|
|
|
|
MOVQ $10, itr1
|
|
MOVQ $0, itr2
|
|
CMPQ inl, $128
|
|
JBE sealAVX2Tail128
|
|
CMPQ inl, $256
|
|
JBE sealAVX2Tail256
|
|
CMPQ inl, $384
|
|
JBE sealAVX2Tail384
|
|
JMP sealAVX2Tail512
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for buffers smaller than 193 bytes
|
|
seal192AVX2:
|
|
// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
|
|
VMOVDQA AA0, AA1
|
|
VMOVDQA BB0, BB1
|
|
VMOVDQA CC0, CC1
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
VMOVDQA AA0, AA2
|
|
VMOVDQA BB0, BB2
|
|
VMOVDQA CC0, CC2
|
|
VMOVDQA DD0, DD2
|
|
VMOVDQA DD1, TT3
|
|
MOVQ $10, itr2
|
|
|
|
sealAVX2192InnerCipherLoop:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
|
|
DECQ itr2
|
|
JNE sealAVX2192InnerCipherLoop
|
|
VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
|
|
VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
|
|
VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
|
|
VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
|
|
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>(SB), TT0, TT0
|
|
VMOVDQA TT0, rsStoreAVX2
|
|
|
|
// Stream for up to 192 bytes
|
|
VPERM2I128 $0x13, AA0, BB0, AA0
|
|
VPERM2I128 $0x13, CC0, DD0, BB0
|
|
VPERM2I128 $0x02, AA1, BB1, CC0
|
|
VPERM2I128 $0x02, CC1, DD1, DD0
|
|
VPERM2I128 $0x13, AA1, BB1, AA1
|
|
VPERM2I128 $0x13, CC1, DD1, BB1
|
|
|
|
sealAVX2ShortSeal:
|
|
// Hash aad
|
|
MOVQ ad_len+80(FP), itr2
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ itr1, itr1
|
|
|
|
sealAVX2SealHash:
|
|
// itr1 holds the number of bytes encrypted but not yet hashed
|
|
CMPQ itr1, $16
|
|
JB sealAVX2ShortSealLoop
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
SUBQ $16, itr1
|
|
ADDQ $16, oup
|
|
JMP sealAVX2SealHash
|
|
|
|
sealAVX2ShortSealLoop:
|
|
CMPQ inl, $32
|
|
JB sealAVX2ShortTail32
|
|
SUBQ $32, inl
|
|
|
|
// Load for encryption
|
|
VPXOR (inp), AA0, AA0
|
|
VMOVDQU AA0, (oup)
|
|
LEAQ (1*32)(inp), inp
|
|
|
|
// Now can hash
|
|
polyAdd(0*8(oup))
|
|
polyMulAVX2
|
|
polyAdd(2*8(oup))
|
|
polyMulAVX2
|
|
LEAQ (1*32)(oup), oup
|
|
|
|
// Shift stream left
|
|
VMOVDQA BB0, AA0
|
|
VMOVDQA CC0, BB0
|
|
VMOVDQA DD0, CC0
|
|
VMOVDQA AA1, DD0
|
|
VMOVDQA BB1, AA1
|
|
VMOVDQA CC1, BB1
|
|
VMOVDQA DD1, CC1
|
|
VMOVDQA AA2, DD1
|
|
VMOVDQA BB2, AA2
|
|
JMP sealAVX2ShortSealLoop
|
|
|
|
sealAVX2ShortTail32:
|
|
CMPQ inl, $16
|
|
VMOVDQA A0, A1
|
|
JB sealAVX2ShortDone
|
|
|
|
SUBQ $16, inl
|
|
|
|
// Load for encryption
|
|
VPXOR (inp), A0, T0
|
|
VMOVDQU T0, (oup)
|
|
LEAQ (1*16)(inp), inp
|
|
|
|
// Hash
|
|
polyAdd(0*8(oup))
|
|
polyMulAVX2
|
|
LEAQ (1*16)(oup), oup
|
|
VPERM2I128 $0x11, AA0, AA0, AA0
|
|
VMOVDQA A0, A1
|
|
|
|
sealAVX2ShortDone:
|
|
VZEROUPPER
|
|
JMP sealSSETail
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for buffers smaller than 321 bytes
|
|
seal320AVX2:
|
|
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
|
|
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
|
|
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
|
|
MOVQ $10, itr2
|
|
|
|
sealAVX2320InnerCipherLoop:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
|
|
DECQ itr2
|
|
JNE sealAVX2320InnerCipherLoop
|
|
|
|
VMOVDQA ·chacha20Constants<>(SB), TT0
|
|
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
|
|
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
|
|
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
|
|
VMOVDQA ·avx2IncMask<>(SB), TT0
|
|
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
|
|
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
|
|
VPADDD TT3, DD2, DD2
|
|
|
|
// Clamp and store poly key
|
|
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
VPAND ·polyClampMask<>(SB), TT0, TT0
|
|
VMOVDQA TT0, rsStoreAVX2
|
|
|
|
// Stream for up to 320 bytes
|
|
VPERM2I128 $0x13, AA0, BB0, AA0
|
|
VPERM2I128 $0x13, CC0, DD0, BB0
|
|
VPERM2I128 $0x02, AA1, BB1, CC0
|
|
VPERM2I128 $0x02, CC1, DD1, DD0
|
|
VPERM2I128 $0x13, AA1, BB1, AA1
|
|
VPERM2I128 $0x13, CC1, DD1, BB1
|
|
VPERM2I128 $0x02, AA2, BB2, CC1
|
|
VPERM2I128 $0x02, CC2, DD2, DD1
|
|
VPERM2I128 $0x13, AA2, BB2, AA2
|
|
VPERM2I128 $0x13, CC2, DD2, BB2
|
|
JMP sealAVX2ShortSeal
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 128 bytes of ciphertext
|
|
sealAVX2Tail128:
|
|
// Need to encrypt up to 128 bytes - prepare two blocks
|
|
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
|
|
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0
|
|
VMOVDQA state1StoreAVX2, BB0
|
|
VMOVDQA state2StoreAVX2, CC0
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0
|
|
VMOVDQA DD0, DD1
|
|
|
|
sealAVX2Tail128LoopA:
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealAVX2Tail128LoopB:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
VPALIGNR $4, BB0, BB0, BB0
|
|
VPALIGNR $8, CC0, CC0, CC0
|
|
VPALIGNR $12, DD0, DD0, DD0
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
|
|
polyAdd(16(oup))
|
|
polyMul
|
|
LEAQ 32(oup), oup
|
|
VPALIGNR $12, BB0, BB0, BB0
|
|
VPALIGNR $8, CC0, CC0, CC0
|
|
VPALIGNR $4, DD0, DD0, DD0
|
|
DECQ itr1
|
|
JG sealAVX2Tail128LoopA
|
|
DECQ itr2
|
|
JGE sealAVX2Tail128LoopB
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA1
|
|
VPADDD state1StoreAVX2, BB0, BB1
|
|
VPADDD state2StoreAVX2, CC0, CC1
|
|
VPADDD DD1, DD0, DD1
|
|
|
|
VPERM2I128 $0x02, AA1, BB1, AA0
|
|
VPERM2I128 $0x02, CC1, DD1, BB0
|
|
VPERM2I128 $0x13, AA1, BB1, CC0
|
|
VPERM2I128 $0x13, CC1, DD1, DD0
|
|
JMP sealAVX2ShortSealLoop
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 256 bytes of ciphertext
|
|
sealAVX2Tail256:
|
|
// Need to encrypt up to 256 bytes - prepare four blocks
|
|
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
|
|
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD1
|
|
VMOVDQA DD0, TT1
|
|
VMOVDQA DD1, TT2
|
|
|
|
sealAVX2Tail256LoopA:
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealAVX2Tail256LoopB:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
|
|
polyAdd(16(oup))
|
|
polyMul
|
|
LEAQ 32(oup), oup
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
|
|
DECQ itr1
|
|
JG sealAVX2Tail256LoopA
|
|
DECQ itr2
|
|
JGE sealAVX2Tail256LoopB
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
|
|
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
|
|
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
VPERM2I128 $0x02, CC0, DD0, TT1
|
|
VPERM2I128 $0x13, AA0, BB0, TT2
|
|
VPERM2I128 $0x13, CC0, DD0, TT3
|
|
VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
|
|
VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
|
|
MOVQ $128, itr1
|
|
LEAQ 128(inp), inp
|
|
SUBQ $128, inl
|
|
VPERM2I128 $0x02, AA1, BB1, AA0
|
|
VPERM2I128 $0x02, CC1, DD1, BB0
|
|
VPERM2I128 $0x13, AA1, BB1, CC0
|
|
VPERM2I128 $0x13, CC1, DD1, DD0
|
|
|
|
JMP sealAVX2SealHash
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 384 bytes of ciphertext
|
|
sealAVX2Tail384:
|
|
// Need to encrypt up to 384 bytes - prepare six blocks
|
|
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
|
|
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
|
|
VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
|
|
|
|
sealAVX2Tail384LoopA:
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealAVX2Tail384LoopB:
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
|
|
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
|
|
polyAdd(16(oup))
|
|
polyMul
|
|
LEAQ 32(oup), oup
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
|
|
DECQ itr1
|
|
JG sealAVX2Tail384LoopA
|
|
DECQ itr2
|
|
JGE sealAVX2Tail384LoopB
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
|
|
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
|
|
VPERM2I128 $0x02, AA0, BB0, TT0
|
|
VPERM2I128 $0x02, CC0, DD0, TT1
|
|
VPERM2I128 $0x13, AA0, BB0, TT2
|
|
VPERM2I128 $0x13, CC0, DD0, TT3
|
|
VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
|
|
VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
|
|
VPERM2I128 $0x02, AA1, BB1, TT0
|
|
VPERM2I128 $0x02, CC1, DD1, TT1
|
|
VPERM2I128 $0x13, AA1, BB1, TT2
|
|
VPERM2I128 $0x13, CC1, DD1, TT3
|
|
VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
|
|
VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
|
|
MOVQ $256, itr1
|
|
LEAQ 256(inp), inp
|
|
SUBQ $256, inl
|
|
VPERM2I128 $0x02, AA2, BB2, AA0
|
|
VPERM2I128 $0x02, CC2, DD2, BB0
|
|
VPERM2I128 $0x13, AA2, BB2, CC0
|
|
VPERM2I128 $0x13, CC2, DD2, DD0
|
|
|
|
JMP sealAVX2SealHash
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Special optimization for the last 512 bytes of ciphertext
|
|
sealAVX2Tail512:
|
|
// Need to encrypt up to 512 bytes - prepare eight blocks
|
|
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
|
|
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
|
|
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
|
|
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
|
|
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
|
|
VMOVDQA ctr3StoreAVX2, DD0
|
|
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
|
|
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
|
|
|
|
sealAVX2Tail512LoopA:
|
|
polyAdd(0(oup))
|
|
polyMul
|
|
LEAQ 16(oup), oup
|
|
|
|
sealAVX2Tail512LoopB:
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
polyAdd(0*8(oup))
|
|
polyMulAVX2
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
polyAdd(2*8(oup))
|
|
polyMulAVX2
|
|
LEAQ (4*8)(oup), oup
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
|
|
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
|
|
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
|
|
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
|
|
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
|
|
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
|
|
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
|
|
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
|
|
VMOVDQA tmpStoreAVX2, CC3
|
|
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
|
|
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
|
|
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
|
|
|
|
DECQ itr1
|
|
JG sealAVX2Tail512LoopA
|
|
DECQ itr2
|
|
JGE sealAVX2Tail512LoopB
|
|
|
|
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
|
|
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
|
|
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
|
|
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
|
|
VMOVDQA CC3, tmpStoreAVX2
|
|
VPERM2I128 $0x02, AA0, BB0, CC3
|
|
VPXOR (0*32)(inp), CC3, CC3
|
|
VMOVDQU CC3, (0*32)(oup)
|
|
VPERM2I128 $0x02, CC0, DD0, CC3
|
|
VPXOR (1*32)(inp), CC3, CC3
|
|
VMOVDQU CC3, (1*32)(oup)
|
|
VPERM2I128 $0x13, AA0, BB0, CC3
|
|
VPXOR (2*32)(inp), CC3, CC3
|
|
VMOVDQU CC3, (2*32)(oup)
|
|
VPERM2I128 $0x13, CC0, DD0, CC3
|
|
VPXOR (3*32)(inp), CC3, CC3
|
|
VMOVDQU CC3, (3*32)(oup)
|
|
|
|
VPERM2I128 $0x02, AA1, BB1, AA0
|
|
VPERM2I128 $0x02, CC1, DD1, BB0
|
|
VPERM2I128 $0x13, AA1, BB1, CC0
|
|
VPERM2I128 $0x13, CC1, DD1, DD0
|
|
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
|
|
|
|
VPERM2I128 $0x02, AA2, BB2, AA0
|
|
VPERM2I128 $0x02, CC2, DD2, BB0
|
|
VPERM2I128 $0x13, AA2, BB2, CC0
|
|
VPERM2I128 $0x13, CC2, DD2, DD0
|
|
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
|
|
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
|
|
|
|
MOVQ $384, itr1
|
|
LEAQ 384(inp), inp
|
|
SUBQ $384, inl
|
|
VPERM2I128 $0x02, AA3, BB3, AA0
|
|
VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
|
|
VPERM2I128 $0x13, AA3, BB3, CC0
|
|
VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
|
|
|
|
JMP sealAVX2SealHash
|
|
|
|
// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
//
// Runs the CPUID instruction with EAX = eaxArg and ECX = ecxArg and returns
// the contents of EAX, EBX, ECX and EDX as left by the instruction.
// Frame: no locals; 8 bytes of arguments followed by 16 bytes of results.
TEXT ·cpuid(SB), NOSPLIT, $0-24
	// Set up the two input registers (order is irrelevant, they are independent).
	MOVL ecxArg+4(FP), CX
	MOVL eaxArg+0(FP), AX

	CPUID

	// Copy the four output registers into the result slots.
	MOVL DX, edx+20(FP)
	MOVL CX, ecx+16(FP)
	MOVL BX, ebx+12(FP)
	MOVL AX, eax+8(FP)
	RET
|
|
|
|
// func xgetbv() (eax, edx uint32)
//
// Runs XGETBV with ECX = 0 and returns the two 32-bit halves of the result:
// eax receives EAX (low half), edx receives EDX (high half).
// Frame: no locals, no arguments, 8 bytes of results.
TEXT ·xgetbv(SB),NOSPLIT,$0-8
	// ECX selects which extended control register is read; use register 0.
	XORL CX, CX

	XGETBV

	// Hand both halves back to the caller.
	MOVL DX, edx+4(FP)
	MOVL AX, eax+0(FP)
	RET
|