--- crypto/openssl/crypto/bn/asm/rsaz-avx2.pl.orig
+++ crypto/openssl/crypto/bn/asm/rsaz-avx2.pl
@@ -239,7 +239,7 @@
 	vmovdqu		32*8-128($ap), $ACC8
 
 	lea	192(%rsp), $tp0			# 64+128=192
-	vpbroadcastq	.Land_mask(%rip), $AND_MASK
+	vmovdqu		.Land_mask(%rip), $AND_MASK
 	jmp	.LOOP_GRANDE_SQR_1024
 
 .align	32
@@ -1070,10 +1070,10 @@
 	vpmuludq	32*6-128($np),$Yi,$TEMP1
 	vpaddq		$TEMP1,$ACC6,$ACC6
 	vpmuludq	32*7-128($np),$Yi,$TEMP2
-	 vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
+	 vpblendd	\$3, $ZERO, $ACC9, $TEMP1	# correct $ACC3
 	vpaddq		$TEMP2,$ACC7,$ACC7
 	vpmuludq	32*8-128($np),$Yi,$TEMP0
-	 vpaddq		$ACC9, $ACC3, $ACC3		# correct $ACC3
+	 vpaddq		$TEMP1, $ACC3, $ACC3		# correct $ACC3
 	vpaddq		$TEMP0,$ACC8,$ACC8
 
 	mov	%rbx, %rax
@@ -1086,7 +1086,9 @@
 	vmovdqu		-8+32*2-128($ap),$TEMP2
 
 	mov	$r1, %rax
+	 vpblendd	\$0xfc, $ZERO, $ACC9, $ACC9	# correct $ACC3
 	imull	$n0, %eax
+	 vpaddq		$ACC9,$ACC4,$ACC4		# correct $ACC3
 	and	\$0x1fffffff, %eax
 
 	imulq	16-128($ap),%rbx
@@ -1322,15 +1324,12 @@
 # But as we underutilize resources, it's possible to correct in
 # each iteration with marginal performance loss. But then, as
 # we do it in each iteration, we can correct less digits, and
-# avoid performance penalties completely. Also note that we
-# correct only three digits out of four. This works because
-# most significant digit is subjected to less additions.
+# avoid performance penalties completely.
 
 $TEMP0 = $ACC9;
 $TEMP3 = $Bi;
 $TEMP4 = $Yi;
 $code.=<<___;
-	vpermq		\$0, $AND_MASK, $AND_MASK
 	vpaddq		(%rsp), $TEMP1, $ACC0
 
 	vpsrlq		\$29, $ACC0, $TEMP1
@@ -1763,7 +1762,7 @@
 .align	64
 .Land_mask:
-	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+	.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
 .Lscatter_permd:
 	.long	0,2,4,6,7,7,7,7
 .Lgather_permd:
--- crypto/openssl/ssl/ssl.h.orig
+++ crypto/openssl/ssl/ssl.h
@@ -1727,7 +1727,7 @@
 # define SSL_ST_BEFORE                  0x4000
 # define SSL_ST_OK                      0x03
 # define SSL_ST_RENEGOTIATE             (0x04|SSL_ST_INIT)
-# define SSL_ST_ERR                     0x05
+# define SSL_ST_ERR                     (0x05|SSL_ST_INIT)
 
 # define SSL_CB_LOOP                    0x01
 # define SSL_CB_EXIT                    0x02
--- secure/lib/libcrypto/amd64/rsaz-avx2.S.orig
+++ secure/lib/libcrypto/amd64/rsaz-avx2.S
@@ -68,7 +68,7 @@
 	vmovdqu	256-128(%rsi),%ymm8
 
 	leaq	192(%rsp),%rbx
-	vpbroadcastq	.Land_mask(%rip),%ymm15
+	vmovdqu	.Land_mask(%rip),%ymm15
 	jmp	.LOOP_GRANDE_SQR_1024
 
 .align	32
@@ -801,10 +801,10 @@
 	vpmuludq	192-128(%rcx),%ymm11,%ymm12
 	vpaddq	%ymm12,%ymm6,%ymm6
 	vpmuludq	224-128(%rcx),%ymm11,%ymm13
-	vpblendd	$3,%ymm14,%ymm9,%ymm9
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
 	vpaddq	%ymm13,%ymm7,%ymm7
 	vpmuludq	256-128(%rcx),%ymm11,%ymm0
-	vpaddq	%ymm9,%ymm3,%ymm3
+	vpaddq	%ymm12,%ymm3,%ymm3
 	vpaddq	%ymm0,%ymm8,%ymm8
 
 	movq	%rbx,%rax
@@ -817,7 +817,9 @@
 	vmovdqu	-8+64-128(%rsi),%ymm13
 
 	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
 	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
 	andl	$0x1fffffff,%eax
 
 	imulq	16-128(%rsi),%rbx
@@ -1046,7 +1048,6 @@
 	decl	%r14d
 	jnz	.Loop_mul_1024
 
-	vpermq	$0,%ymm15,%ymm15
 	vpaddq	(%rsp),%ymm12,%ymm0
 
 	vpsrlq	$29,%ymm0,%ymm12
@@ -1686,7 +1687,7 @@
 .align	64
 .Land_mask:
-.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
 .Lscatter_permd:
 .long	0,2,4,6,7,7,7,7
 .Lgather_permd: