751 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			751 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * AES-NI + SSE2 implementation of AEGIS-128
 | |
|  *
 | |
|  * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 | |
|  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify it
 | |
|  * under the terms of the GNU General Public License version 2 as published
 | |
|  * by the Free Software Foundation.
 | |
|  */
 | |
| 
 | |
| #include <linux/linkage.h>
 | |
| #include <asm/frame.h>
 | |
| 
 | |
/*
 * Register aliases used throughout this file.
 *
 * KEY and MSG intentionally name the same register (%xmm5): in this file
 * KEY is referenced only by crypto_aegis128_aesni_init, and MSG only by the
 * other routines, so the two are never live at the same time.
 */

/* general-purpose argument registers */
#define STATEP	%rdi	/* pointer to the 5 x 16-byte state buffer */
#define LEN	%rsi	/* byte count */
#define SRC	%rdx	/* source pointer */
#define DST	%rcx	/* destination pointer */

/* the five 128-bit state words */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4

/* key block (init only) / message block (everything else) */
#define KEY	%xmm5
#define MSG	%xmm5

/* temporaries */
#define T0	%xmm6
#define T1	%xmm7
 | |
| 
 | |
/*
 * Initialization constants, loaded by crypto_aegis128_aesni_init.
 * Read in order, the 32 bytes of const_0 followed by const_1 are the
 * Fibonacci sequence reduced modulo 256 (0, 1, 1, 2, 3, 5, 8, 13, ...).
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..15.  crypto_aegis128_aesni_dec_tail compares these
 * against the broadcast byte count to build a "first LEN bytes" mask.
 */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
 | |
| 
 | |
| .text
 | |
| 
 | |
/*
 * aegis128_update - one state-update round, without the message XOR
 *
 * Every state register is replaced by one AES round of itself, keyed with
 * the *old* value of its logical predecessor.  The instruction order below
 * is what guarantees that each aesenc still sees the unmodified predecessor,
 * so it must not be rearranged.
 *
 * The result is left in "shifted" positions: the register that held word
 * S4 on entry holds the new word S0 on exit, the one that held S0 holds the
 * new S1, and so on.  Callers account for this by rotating which register
 * they treat as S0 (and by XORing the message into that register
 * afterwards).
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 */
.macro aegis128_update
	movdqa	STATE4, T0		/* keep old S4: it keys the last round */
	aesenc	STATE0, STATE4		/* STATE4 <- round(S4) ^ S0 */
	aesenc	STATE1, STATE0		/* STATE0 <- round(S0) ^ S1 */
	aesenc	STATE2, STATE1		/* STATE1 <- round(S1) ^ S2 */
	aesenc	STATE3, STATE2		/* STATE2 <- round(S2) ^ S3 */
	aesenc	T0,     STATE3		/* STATE3 <- round(S3) ^ old S4 */
.endm
 | |
| 
 | |
/*
 * __load_partial: internal ABI
 *
 * Assemble a zero-padded 16-byte block from a short buffer using only
 * naturally sized loads (1, 2, 4 and 8 bytes), selected by bits 0-3 of LEN.
 * Each piece is read from the offset formed by the higher LEN bits, so the
 * pieces never reach past SRC + LEN.
 * NOTE(review): the callers in this file pass the tail length of a
 * message; presumably that is always below 16 - confirm against the C glue.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 */
__load_partial:
	/* start from an all-zero accumulator and block */
	pxor	MSG, MSG
	xor	%r9d, %r9d

	/* bit 0: one trailing byte */
	test	$0x1, LEN
	jz	.Lld_partial_1

	mov	LEN, %r8
	and	$0x1E, %r8
	mov	(SRC, %r8), %r9b

.Lld_partial_1:
	/* bit 1: a 16-bit piece goes below what was gathered so far */
	test	$0x2, LEN
	jz	.Lld_partial_2

	mov	LEN, %r8
	and	$0x1C, %r8
	shl	$0x10, %r9
	mov	(SRC, %r8), %r9w

.Lld_partial_2:
	/* bit 2: a 32-bit piece goes below what was gathered so far */
	test	$0x4, LEN
	jz	.Lld_partial_4

	mov	LEN, %r8
	and	$0x18, %r8
	shl	$32, %r9
	mov	(SRC, %r8), %r8d	/* 32-bit load zero-extends %r8 */
	xor	%r8, %r9

.Lld_partial_4:
	/* up to 7 bytes collected: move them into the low half of MSG */
	movq	%r9, MSG

	/* bit 3: a full 64-bit piece becomes the low half instead */
	test	$0x8, LEN
	jz	.Lld_partial_8

	mov	LEN, %r8
	and	$0x10, %r8
	pslldq	$8, MSG			/* push the collected bytes to the top */
	movq	(SRC, %r8), T0
	pxor	T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)
 | |
| 
 | |
/*
 * __store_partial: internal ABI
 *
 * Write the first LEN bytes of T0 to DST using 8-, 4-, 2- and 1-byte
 * stores, largest first.
 *
 * The callers in this file receive the length as an 'unsigned int', so
 * only the low 32 bits of LEN are meaningful; the upper half of the
 * register is discarded here and all comparisons are unsigned.
 *
 * input:
 *   LEN - bytes
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0   (shifted right by 8 bytes when LEN >= 8)
 *   %r8
 *   %r9
 *   %r10
 */
__store_partial:
	mov	LEN, %r8
	mov	%r8d, %r8d		/* zero-extend: drop undefined upper bits */
	mov	DST, %r9

	movq	T0, %r10		/* low 8 bytes of the block */

	cmp	$8, %r8
	jb	.Lst_partial_8

	mov	%r10, (%r9)
	psrldq	$8, T0			/* continue with the high 8 bytes */
	movq	T0, %r10

	sub	$8, %r8
	add	$8, %r9

.Lst_partial_8:
	cmp	$4, %r8
	jb	.Lst_partial_4

	mov	%r10d, (%r9)
	shr	$32, %r10

	sub	$4, %r8
	add	$4, %r9

.Lst_partial_4:
	cmp	$2, %r8
	jb	.Lst_partial_2

	mov	%r10w, (%r9)
	shr	$0x10, %r10

	sub	$2, %r8
	add	$2, %r9

.Lst_partial_2:
	cmp	$1, %r8
	jb	.Lst_partial_1

	mov	%r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
 | |
| 
 | |
/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Build the initial state from a 16-byte key and a 16-byte IV and write the
 * five resulting state words to 'state' (80 bytes).
 *
 *   %rdi (STATEP) - state output buffer
 *   %rsi          - key
 *   %rdx          - iv
 *
 * Both key and IV are read with unaligned loads, since the prototype gives
 * no alignment guarantee.  The constants are addressed RIP-relative so the
 * code does not depend on absolute 32-bit addresses.
 *
 * changed: STATE[0-4], KEY, T0, T1
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqu (%rsi), KEY
	pxor KEY, T1			/* T1 = KEY xor IV */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants: */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3		/* STATE3 = KEY xor const_0 */
	pxor STATE1, STATE4		/* STATE4 = KEY xor const_1 */

	/*
	 * update 10 times with KEY / KEY xor IV:
	 * each update leaves the new first state word one register "earlier"
	 * (STATE4, STATE3, ...), which is where the input is XORed in.
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state (10 updates = two full rotations, natural order): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)
 | |
| 
 | |
/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorb associated data into the state, 16 bytes at a time.  Only whole
 * blocks are processed: any trailing (length % 16) bytes are ignored, and
 * if length < 16 the state is left untouched.
 *
 *   %rdi (STATEP) - state, read and written back
 *   %esi          - length in bytes
 *   %rdx (SRC)    - data
 *
 * 'length' is a 32-bit argument, so the upper half of %rsi is not defined
 * by the calling convention.  It is zero-extended once on entry; after that
 * LEN may be used as a 64-bit unsigned quantity.
 *
 * The loop is unrolled five times so that the register rotation performed
 * by aegis128_update comes back to its starting point; each early exit
 * stores the registers in the order matching how far the rotation got.
 *
 * changed: STATE[0-4], MSG, T0, %r8, LEN, SRC
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	/* zero-extend the 32-bit length held in LEN (%rsi): */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* pick the aligned-load loop only if SRC is 16-byte aligned: */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state: */
.Lad_out_0:
	/* 5 blocks since the last wrap: registers are in natural order */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_1:
	/* 1 block: first state word lives in STATE4 */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_2:
	/* 2 blocks: first state word lives in STATE3 */
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_3:
	/* 3 blocks: first state word lives in STATE2 */
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_4:
	/* 4 blocks: first state word lives in STATE1 */
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out:
	/* fewer than 16 bytes: nothing absorbed, state not rewritten */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)
 | |
| 
 | |
/*
 * encrypt_block - encrypt one full 16-byte block and absorb the plaintext
 *
 *   a        - 'a' or 'u': selects movdqa / movdqu for the SRC and DST access
 *   s0 .. s4 - the registers currently holding state words S0..S4
 *              (s0 is accepted for symmetry but not referenced)
 *   i        - block index inside the 5x unrolled loop; selects the data
 *              offset and the .Lenc_out_<i> exit label
 *
 * Stores MSG xor s1 xor s4 xor (s2 and s3) to DST, runs the state update,
 * XORs the plaintext into \s4 (which holds the new first state word after
 * the update), then consumes 16 bytes of LEN and leaves the loop through
 * .Lenc_out_<i> when fewer than 16 bytes remain.  LEN must already be
 * zero-extended; the comparison is unsigned.
 *
 * changed: MSG, T0, T1, LEN, flags
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lenc_out_\i
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypt all whole 16-byte blocks of src into dst and update the state.
 * Trailing (length % 16) bytes are not touched, and if length < 16 the
 * function returns without reading or writing anything.
 *
 *   %rdi (STATEP) - state, read and written back
 *   %esi          - length in bytes
 *   %rdx (SRC)    - plaintext
 *   %rcx (DST)    - ciphertext
 *
 * 'length' is a 32-bit argument, so the upper half of %rsi is not defined
 * by the calling convention; it is zero-extended once on entry.
 *
 * changed: STATE[0-4], MSG, T0, T1, %r8, LEN, SRC, DST
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	/* zero-extend the 32-bit length held in LEN (%rsi): */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* aligned accesses require both SRC and DST to be 16-byte aligned: */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/*
	 * store the state: .Lenc_out_<i> is reached after block i of the
	 * unrolled loop, i.e. after i + 1 register rotations.
	 */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out:
	/* fewer than 16 bytes: nothing processed, state not rewritten */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)
 | |
| 
 | |
/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypt a final partial block: 'length' bytes are gathered from src into
 * a zero-padded block, encrypted, and exactly 'length' bytes of ciphertext
 * are written to dst.  The zero-padded plaintext is then absorbed into the
 * state.
 * NOTE(review): the helpers only handle a short tail; presumably callers
 * guarantee length < 16 - confirm against the C glue.
 *
 *   %rdi (STATEP) - state, read and written back
 *   %esi          - length in bytes
 *   %rdx (SRC)    - plaintext tail
 *   %rcx (DST)    - ciphertext tail
 *
 * 'length' is a 32-bit argument, so the upper half of %rsi is not defined
 * by the calling convention; it is zero-extended before the helpers, which
 * read LEN as a 64-bit register, are called.
 *
 * changed: STATE[0-4], MSG, T0, T1, %r8, %r9, %r10, LEN
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/* zero-extend the 32-bit length held in LEN (%rsi): */
	mov %esi, %esi

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial		/* MSG = zero-padded plaintext */

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0			/* T0 = ciphertext block */

	call __store_partial		/* writes LEN bytes of T0 to DST */

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update: first state word is in STATE4): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)
 | |
| 
 | |
/*
 * decrypt_block - decrypt one full 16-byte block and absorb the plaintext
 *
 *   a        - 'a' or 'u': selects movdqa / movdqu for the SRC and DST access
 *   s0 .. s4 - the registers currently holding state words S0..S4
 *              (s0 is accepted for symmetry but not referenced)
 *   i        - block index inside the 5x unrolled loop; selects the data
 *              offset and the .Ldec_out_<i> exit label
 *
 * Computes MSG = ciphertext xor s1 xor s4 xor (s2 and s3) in place, stores
 * it to DST, runs the state update, XORs the recovered plaintext into \s4
 * (which holds the new first state word after the update), then consumes
 * 16 bytes of LEN and leaves the loop through .Ldec_out_<i> when fewer than
 * 16 bytes remain.  LEN must already be zero-extended; the comparison is
 * unsigned.
 *
 * changed: MSG, T0, T1, LEN, flags
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Ldec_out_\i
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypt all whole 16-byte blocks of src into dst and update the state.
 * Trailing (length % 16) bytes are not touched, and if length < 16 the
 * function returns without reading or writing anything.
 *
 *   %rdi (STATEP) - state, read and written back
 *   %esi          - length in bytes
 *   %rdx (SRC)    - ciphertext
 *   %rcx (DST)    - plaintext
 *
 * 'length' is a 32-bit argument, so the upper half of %rsi is not defined
 * by the calling convention; it is zero-extended once on entry.
 *
 * changed: STATE[0-4], MSG, T0, T1, %r8, LEN, SRC, DST
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	/* zero-extend the 32-bit length held in LEN (%rsi): */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* aligned accesses require both SRC and DST to be 16-byte aligned: */
	mov  SRC, %r8
	or   DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/*
	 * store the state: .Ldec_out_<i> is reached after block i of the
	 * unrolled loop, i.e. after i + 1 register rotations.
	 */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out:
	/* fewer than 16 bytes: nothing processed, state not rewritten */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)
 | |
| 
 | |
/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Decrypt a final partial block: 'length' bytes are gathered from src into
 * a zero-padded block, decrypted, and exactly 'length' bytes of plaintext
 * are written to dst.  Before the plaintext is absorbed into the state, the
 * bytes at positions >= length are cleared again, because XORing the
 * keystream into the zero padding left non-zero bytes there.
 * NOTE(review): the helpers only handle a short tail; presumably callers
 * guarantee length < 16 - confirm against the C glue.
 *
 *   %rdi (STATEP) - state, read and written back
 *   %esi          - length in bytes
 *   %rdx (SRC)    - ciphertext tail
 *   %rcx (DST)    - plaintext tail
 *
 * 'length' is a 32-bit argument, so the upper half of %rsi is not defined
 * by the calling convention; it is zero-extended before the helpers, which
 * read LEN as a 64-bit register, are called.
 *
 * changed: STATE[0-4], MSG, T0, T1, %r8, %r9, %r10, LEN
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/* zero-extend the 32-bit length held in LEN (%rsi): */
	mov %esi, %esi

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial		/* MSG = zero-padded ciphertext */

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	/* __store_partial consumes T0, so hand it a copy and keep MSG: */
	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count:
	 * broadcast the low byte of LEN to all 16 lanes, then compare it
	 * against the lane indices 0..15; lanes with index < LEN become 0xff.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update: first state word is in STATE4): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)
 | |
| 
 | |
/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finalization: mixes the two lengths (converted to bit counts) into the
 * state with seven update rounds, then XORs all five resulting state words
 * into the 16 bytes at tag_xor.  The state buffer is only read; it is not
 * written back.
 *
 *   %rdi (STATEP) - state (read only)
 *   %rsi          - tag_xor, 16 bytes, updated in place
 *   %rdx          - assoclen in bytes
 *   %rcx          - cryptlen in bytes
 *
 * changed: STATE[0-4], MSG, T0
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* prepare length block: low qword = assoclen, high qword = cryptlen */
	movq %rcx, T0
	movq %rdx, MSG
	punpcklqdq T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	/* load the state: */
	movdqu 0x40(STATEP), STATE4
	movdqu 0x30(STATEP), STATE3
	movdqu 0x20(STATEP), STATE2
	movdqu 0x10(STATEP), STATE1
	movdqu 0x00(STATEP), STATE0

	/* the block fed to every round is (lengths xor S3) */
	pxor STATE3, MSG

	/* update state: */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/*
	 * xor tag: the sum covers all five words, so the rotated register
	 * positions left by the updates do not matter here.
	 */
	movdqu (%rsi), MSG

	pxor STATE4, MSG
	pxor STATE3, MSG
	pxor STATE2, MSG
	pxor STATE1, MSG
	pxor STATE0, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)
 | 
