Merge remote-tracking branch 'origin/master'

# Conflicts: # README.md
2026-06-29 09:13:38 +08:00 · 2017-02-14 16:12:29 +08:00
parent d2254170b8
commit 43fed1108d
3503 changed files with 320546 additions and 408546 deletions
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -1,160 +0,0 @@
-#
-# OpenSSL/crypto/modes/Makefile
-#
-
-DIR=	modes
-TOP=	../..
-CC=	cc
-INCLUDES= -I.. -I$(TOP) -I../../include
-CFLAG=-g
-MAKEFILE=	Makefile
-AR=		ar r
-
-MODES_ASM_OBJ=
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-TEST=
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
-	ccm128.c xts128.c wrap128.c
-LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
-	ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)
-
-SRC= $(LIBSRC)
-
-#EXHEADER= store.h str_compat.h
-EXHEADER= modes.h
-HEADER=	modes_lcl.h $(EXHEADER)
-
-ALL=    $(GENERAL) $(SRC) $(HEADER)
-
-top:
-	(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all:	lib
-
-lib:	$(LIBOBJ)
-	$(AR) $(LIB) $(LIBOBJ)
-	$(RANLIB) $(LIB) || echo Never mind.
-	@touch lib
-
-ghash-ia64.s:	asm/ghash-ia64.pl
-	$(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
-ghash-x86.s:	asm/ghash-x86.pl
-	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-ghash-x86_64.s:	asm/ghash-x86_64.pl
-	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-gcm-x86_64.s:	asm/aesni-gcm-x86_64.pl
-	$(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@
-ghash-sparcv9.s:	asm/ghash-sparcv9.pl
-	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
-ghash-alpha.s:	asm/ghash-alpha.pl
-	(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
-	$(PERL) asm/ghash-alpha.pl > $$preproc && \
-	$(CC) -E -P $$preproc > $@ && rm $$preproc)
-ghash-parisc.s:	asm/ghash-parisc.pl
-	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
-ghashv8-armx.S:	asm/ghashv8-armx.pl
-	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
-ghashp8-ppc.s:	asm/ghashp8-ppc.pl
-	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
-
-# GNU make "catch all"
-ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
-
-ghash-armv4.o:	ghash-armv4.S
-ghashv8-armx.o:	ghashv8-armx.S
-
-files:
-	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
-
-links:
-	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
-	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
-	@headerlist="$(EXHEADER)"; for i in $$headerlist; \
-	do  \
-	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-	done;
-
-tags:
-	ctags $(SRC)
-
-tests:
-
-lint:
-	lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-update: depend
-
-depend:
-	@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
-	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
-	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-	mv -f Makefile.new $(MAKEFILE)
-
-clean:
-	rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
-ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
-cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
-ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
-cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
-gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
-ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
-wrap128.o: ../../e_os.h ../../include/openssl/bio.h
-wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h
-wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-wrap128.o: ../cryptlib.h wrap128.c
-xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c
--- a/crypto/modes/Makefile.save
+++ b/crypto/modes/Makefile.save
@@ -1,160 +0,0 @@
-#
-# OpenSSL/crypto/modes/Makefile
-#
-
-DIR=	modes
-TOP=	../..
-CC=	cc
-INCLUDES= -I.. -I$(TOP) -I../../include
-CFLAG=-g
-MAKEFILE=	Makefile
-AR=		ar r
-
-MODES_ASM_OBJ=
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-TEST=
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
-	ccm128.c xts128.c wrap128.c
-LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
-	ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)
-
-SRC= $(LIBSRC)
-
-#EXHEADER= store.h str_compat.h
-EXHEADER= modes.h
-HEADER=	modes_lcl.h $(EXHEADER)
-
-ALL=    $(GENERAL) $(SRC) $(HEADER)
-
-top:
-	(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all:	lib
-
-lib:	$(LIBOBJ)
-	$(AR) $(LIB) $(LIBOBJ)
-	$(RANLIB) $(LIB) || echo Never mind.
-	@touch lib
-
-ghash-ia64.s:	asm/ghash-ia64.pl
-	$(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
-ghash-x86.s:	asm/ghash-x86.pl
-	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-ghash-x86_64.s:	asm/ghash-x86_64.pl
-	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-gcm-x86_64.s:	asm/aesni-gcm-x86_64.pl
-	$(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@
-ghash-sparcv9.s:	asm/ghash-sparcv9.pl
-	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
-ghash-alpha.s:	asm/ghash-alpha.pl
-	(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
-	$(PERL) asm/ghash-alpha.pl > $$preproc && \
-	$(CC) -E -P $$preproc > $@ && rm $$preproc)
-ghash-parisc.s:	asm/ghash-parisc.pl
-	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
-ghashv8-armx.S:	asm/ghashv8-armx.pl
-	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
-ghashp8-ppc.s:	asm/ghashp8-ppc.pl
-	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
-
-# GNU make "catch all"
-ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
-
-ghash-armv4.o:	ghash-armv4.S
-ghashv8-armx.o:	ghashv8-armx.S
-
-files:
-	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
-
-links:
-	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
-	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
-	@headerlist="$(EXHEADER)"; for i in $$headerlist; \
-	do  \
-	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-	done;
-
-tags:
-	ctags $(SRC)
-
-tests:
-
-lint:
-	lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-update: depend
-
-depend:
-	@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
-	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
-	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-	mv -f Makefile.new $(MAKEFILE)
-
-clean:
-	rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
-ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
-cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
-ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
-cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
-gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
-ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
-wrap128.o: ../../e_os.h ../../include/openssl/bio.h
-wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h
-wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-wrap128.o: ../cryptlib.h wrap128.c
-xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -22,10 +29,11 @@
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
 # pressure with notable relative improvement, achieving 1.0 cycle per
-# byte processed with 128-bit key on Haswell processor, and 0.74 -
-# on Broadwell. [Mentioned results are raw profiled measurements for
-# favourable packet size, one divisible by 96. Applications using the
-# EVP interface will observe a few percent worse performance.]
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -43,7 +51,7 @@ die "can't locate x86_64-xlate.pl";

 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-	$avx = ($1>=2.19) + ($1>=2.22);
+	$avx = ($1>=2.20) + ($1>=2.22);
 }

 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -56,11 +64,11 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=11);
 }

-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }

-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;

 if ($avx>1) {{{
@@ -108,6 +116,23 @@ _aesni_ctr32_ghash_6x:
 	  vpxor		$rndkey,$inout3,$inout3
 	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
 	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+
+	# At this point, the current block of 96 (0x60) bytes has already been
+	# loaded into registers. Concurrently with processing it, we want to
+	# load the next 96 bytes of input for the next round. Obviously, we can
+	# only do this if there are at least 96 more bytes of input beyond the
+	# input we're currently processing, or else we'd read past the end of
+	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
+	# bytes of input beyond the 96 bytes we're already processing, and we
+	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
+	# we'll read in the next block so that it is in registers for the next
+	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
+	# the current block and then ignore what we re-read.
+	#
+	# At this point, |$in0| points to the current (already read into
+	# registers) block, and |$end0| points to 2*96 bytes before the end of
+	# the input. Thus, |$in0| > |$end0| means that we do not have the next
+	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
 	xor		%r12,%r12
 	cmp		$in0,$end0

@@ -400,6 +425,9 @@ $code.=<<___;
 .align	32
 aesni_gcm_decrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
+	# bytes of input.
 	cmp	\$0x60,$len			# minimal accepted length
 	jb	.Lgcm_dec_abort

@@ -454,7 +482,15 @@ $code.=<<___;
 	vmovdqu		0x50($inp),$Z3		# I[5]
 	lea		($inp),$in0
 	vmovdqu		0x40($inp),$Z0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
+	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
+	# not be near the very beginning of the address space when |$len| < 2*96
+	# (0xc0).
 	lea		-0xc0($inp,$len),$end0
+
 	vmovdqu		0x30($inp),$Z1
 	shr		\$4,$len
 	xor		$ret,$ret
@@ -489,7 +525,7 @@ $code.=<<___;
 ___
 $code.=<<___ if ($win64);
 	movaps	-0xd8(%rax),%xmm6
-	movaps	-0xd8(%rax),%xmm7
+	movaps	-0xc8(%rax),%xmm7
 	movaps	-0xb8(%rax),%xmm8
 	movaps	-0xa8(%rax),%xmm9
 	movaps	-0x98(%rax),%xmm10
@@ -610,6 +646,10 @@ _aesni_ctr32_6x:
 .align	32
 aesni_gcm_encrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
+	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
+	# least 96 more bytes of input.
 	cmp	\$0x60*3,$len			# minimal accepted length
 	jb	.Lgcm_enc_abort

@@ -659,7 +699,16 @@ $code.=<<___;
 .Lenc_no_key_aliasing:

 	lea		($out),$in0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
+	# the decryption case, there's no caveat that |$out| must not be near
+	# the very beginning of the address space, because we know that
+	# |$len| >= 3*96 from the check above, and so we know
+	# |$out| + |$len| >= 2*96 (0xc0).
 	lea		-0xc0($out,$len),$end0
+
 	shr		\$4,$len

 	call		_aesni_ctr32_6x
--- a/crypto/modes/asm/ghash-alpha.pl
+++ b/crypto/modes/asm/ghash-alpha.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -454,7 +461,7 @@ rem_4bit:
 .align	4

 ___
-$output=shift and open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
 print $code;
 close STDOUT;

--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -42,10 +49,10 @@
 # below and combine it with reduction algorithm from x86 module.
 # Performance improvement over previous version varies from 65% on
 # Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
-# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
-# in 9.33.
+# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
+# Snapdragon S4 - in 9.33.
 #
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
 # 
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -71,8 +78,20 @@
 # *native* byte order on current platform. See gcm128.c for working
 # example...

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;	# perlasm flavour, or the output file when no flavour is given
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }	# first arg is a file name
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }	# else skip to first file-like arg
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory this script was invoked from
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+    # Quote both paths so a directory name containing spaces survives the shell.
+    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+} else {
+    open STDOUT,">$output";	# no translator requested: write the code straight to the file
+}

 $Xi="r0";	# argument block
 $Htbl="r1";
@@ -124,7 +143,19 @@ $code=<<___;
 #include "arm_arch.h"

 .text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
 .code	32
+#endif
+
+#ifdef  __clang__
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
+#endif

 .type	rem_4bit,%object
 .align	5
@@ -137,19 +168,27 @@ rem_4bit:

 .type	rem_4bit_get,%function
 rem_4bit_get:
-	sub	$rem_4bit,pc,#8
-	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+#if defined(__thumb2__)
+	adr	$rem_4bit,rem_4bit
+#else
+	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
+#endif
 	b	.Lrem_4bit_got
 	nop
+	nop
 .size	rem_4bit_get,.-rem_4bit_get

 .global	gcm_ghash_4bit
 .type	gcm_ghash_4bit,%function
+.align	4
 gcm_ghash_4bit:
-	sub	r12,pc,#8
+#if defined(__thumb2__)
+	adr	r12,rem_4bit
+#else
+	sub	r12,pc,#8+48		@ &rem_4bit
+#endif
 	add	$len,$inp,$len		@ $len to point at the end
 	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
-	sub	r12,r12,#48		@ &rem_4bit

 	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
 	stmdb	sp!,{r4-r11}		@ ... to stack
@@ -196,6 +235,9 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$nlo,[$inp,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -206,6 +248,9 @@ gcm_ghash_4bit:
 	add	$nhi,$nhi,$nhi
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	eor	$Zll,$Tll,$Zll,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$Tll,[$Xi,$cnt]
 	eor	$Zll,$Zll,$Zlh,lsl#28
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
@@ -213,8 +258,14 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
+#ifdef	__thumb2__
+	it	pl
+#endif
 	eorpl	$nlo,$nlo,$Tll
 	eor	$Zhh,$Thh,$Zhh,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
@@ -224,7 +275,11 @@ gcm_ghash_4bit:
 	add	$inp,$inp,#16
 	mov	$nhi,$Zll
 ___
-	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+	&Zsmash("cmp\t$inp,$len","\n".
+				 "#ifdef __thumb2__\n".
+				 "	it	ne\n".
+				 "#endif\n".
+				 "	ldrneb	$nlo,[$inp,#15]");
 $code.=<<___;
 	bne	.Louter

@@ -282,6 +337,9 @@ gcm_gmult_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$nlo,[$Xi,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -299,6 +357,9 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
@@ -373,9 +434,9 @@ $code.=<<___;
 .type	gcm_init_neon,%function
 .align	4
 gcm_init_neon:
-	vld1.64		$IN#hi,[r1,:64]!	@ load H
+	vld1.64		$IN#hi,[r1]!		@ load H
 	vmov.i8		$t0,#0xe1
-	vld1.64		$IN#lo,[r1,:64]
+	vld1.64		$IN#lo,[r1]
 	vshl.i64	$t0#hi,#57
 	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
 	vdup.8		$t1,$IN#hi[7]
@@ -394,8 +455,8 @@ gcm_init_neon:
 .type	gcm_gmult_neon,%function
 .align	4
 gcm_gmult_neon:
-	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
-	vld1.64		$IN#lo,[$Xi,:64]!
+	vld1.64		$IN#hi,[$Xi]!		@ load Xi
+	vld1.64		$IN#lo,[$Xi]!
 	vmov.i64	$k48,#0x0000ffffffffffff
 	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 	vmov.i64	$k32,#0x00000000ffffffff
@@ -412,8 +473,8 @@ gcm_gmult_neon:
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
-	vld1.64		$Xl#lo,[$Xi,:64]!
+	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
+	vld1.64		$Xl#lo,[$Xi]!
 	vmov.i64	$k48,#0x0000ffffffffffff
 	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 	vmov.i64	$k32,#0x00000000ffffffff
@@ -432,12 +493,12 @@ gcm_ghash_neon:
 	veor		$IN,$Xl			@ inp^=Xi
 .Lgmult_neon:
 ___
-	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
+	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
 $code.=<<___;
 	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
 ___
-	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
-	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
+	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
+	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
 $code.=<<___;
 	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
 	veor		$Xm,$Xm,$Xh
@@ -468,8 +529,8 @@ $code.=<<___;
 	vrev64.8	$Xl,$Xl
 #endif
 	sub		$Xi,#16	
-	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
-	vst1.64		$Xl#lo,[$Xi,:64]
+	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
+	vst1.64		$Xl#lo,[$Xi]

 	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
--- a/crypto/modes/asm/ghash-c64xplus.pl
+++ b/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,247 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements the GCM GHASH function and the underlying single
+# multiplication operation in GF(2^128). Even though the subroutines
+# have a _4bit suffix, they do not use any tables, but rely on hardware
+# Galois Field Multiply support. Streamed GHASH processes one byte in
+# ~7 cycles, which is >6x faster than "4-bit" table-driven code compiled
+# with TI's cl6x 6.0 with -mv6400+ -o2 flags. This compares apples to
+# oranges, but the compiler surely could have done better, because the
+# theoretical [though not necessarily achievable] estimate for a "4-bit"
+# table-driven implementation is ~12 cycles.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip args until one ends in name.ext
+open STDOUT,">$output";	# send everything printed below to that file
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments
+
+($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
+			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
+			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5");		# $rem zaps $Htable
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
+	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
+	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
+	.endif
+
+	.asg	B3,RA
+
+	.if	0
+	.global	_gcm_gmult_1bit
+_gcm_gmult_1bit:
+	ADDAD	$Htable,2,$Htable
+	.endif
+	.global	_gcm_gmult_4bit
+_gcm_gmult_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+||	LDBU	*++${xip}[15],$x1	; Xi[15]
+	MVK	0xFF,$FF000000
+||	LDBU	*--${xip},$x0		; Xi[14]
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	BNOP	ghash_loop?
+||	MVK	1,B0			; take a single spin
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+||	ZERO	$Z1:$Z0
+	SHRU2	$xia,8,$H01u
+||	ZERO	$Z3:$Z2
+	.endasmfunc
+
+	.global	_gcm_ghash_4bit
+_gcm_ghash_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+||	SHRU	$len,4,B0		; reassign len
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
+	MVK	0xFF,$FF000000
+|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+||	LDDW	*${xip}[1],$Z1:$Z0
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	LDDW	*${xip}[0],$Z3:$Z2
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+	SHRU2	$xia,8,$H01u
+
+|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+	.if	.LITTLE_ENDIAN
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+
+ghash_loop?:
+	SPLOOPD	6			; 6*16+7
+||	MVC	B1,ILC
+|| [B0]	SUB	B0,1,B0
+||	ZERO	A0
+||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
+||	SHL	$x1,1,$xia
+___
+
+########____________________________
+#  0    D2.     M1          M2      |
+#  1            M1                  |
+#  2            M1          M2      |
+#  3        D1. M1          M2      |
+#  4        S1. L1                  |
+#  5    S2  S1x L1          D2  L2  |____________________________
+#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
+#  7/1          L1  S1  D1x S2  M2  |        M1                  |
+#  8/2              S1  L1x S2      |        M1          M2      |
+#  9/3              S1  L1x         |    D1. M1          M2      |
+# 10/4                  D1x         |    S1. L1                  |
+# 11/5                              |S2  S1x L1          D2  L2  |____________
+# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
+#    7/1                                     L1  S1  D1x S2  M2  |        ....
+#    8/2                                         S1  L1x S2      |        ....
+#####...                                         ................|............
+$code.=<<___;
+	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
+||	XORMPY	$H01u,$xib,$H01y
+|| [A0]	LDBU	*--${xip},$x0
+	XORMPY	$H1,$xia,$H1x		; 1
+	XORMPY	$H2,$xia,$H2x		; 2
+||	XORMPY	$H2u,$xib,$H2y
+	XORMPY	$H3,$xia,$H3x		; 3
+||	XORMPY	$H3u,$xib,$H3y
+||[!A0]	MVK.D	15,A0				; *--${xip} counter
+	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
+|| [A0]	SUB.S	A0,1,A0
+	XOR.L	$H1x,$Z1,$Z1		; 5
+||	AND.D	$H01y,$FF000000,$H0z
+||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
+||	SHL	$x0,1,$xib
+||	SHL	$x0,1,$xia
+
+	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
+||	SHL	$Z0,1,$rem		;	; rem=Z<<1
+||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
+||	AND.L	$H1y,$FF000000,$H1z
+	XOR.L	$H3x,$Z3,$Z3		; 7/1
+||	SHRMB.S	$Z2,$Z1,$Z1
+||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
+||	AND.S	$H2y,$FF000000,$H2z
+||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
+	XOR.L	$H1z,$Z1,$Z1		; 8/2
+||	SHRMB.S	$Z3,$Z2,$Z2
+||	AND.S	$H3y,$FF000000,$H3z
+	XOR.L	$H2z,$Z2,$Z2		; 9/3
+||	SHRU	$Z3,8,$Z3
+	XOR.D	$H3z,$Z3,$Z3		; 10/4
+	NOP				; 11/5
+
+	SPKERNEL 0,2
+||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
+
+	; input pre-fetch is possible where D1 slot is available...
+   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
+   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
+	NOP				; 10/-
+	.if	.LITTLE_ENDIAN
+	SWAP2	$Z0,$Z1			; 11/-
+||	SWAP4	$Z1,$Z0
+	SWAP4	$Z1,$Z1			; 12/-
+||	SWAP2	$Z0,$Z0
+	SWAP2	$Z2,$Z3
+||	SWAP4	$Z3,$Z2
+||[!B0]	BNOP	RA
+	SWAP4	$Z3,$Z3
+||	SWAP2	$Z2,$Z2
+|| [B0]	BNOP	ghash_loop?
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+  [!B0]	BNOP	RA			; 11/-
+   [B0]	BNOP	ghash_loop?		; 12/-
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+	.endasmfunc
+
+	.sect	.const
+	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,7 +39,7 @@
 # Itanium performance should remain the same as the "256B" version,
 # i.e. ~8.5 cycles.

-$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");

 if ($^O eq "hpux") {
    $ADDP="addp4";
--- a/crypto/modes/asm/ghash-parisc.pl
+++ b/crypto/modes/asm/ghash-parisc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
--- a/crypto/modes/asm/ghash-s390x.pl
+++ b/crypto/modes/asm/ghash-s390x.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -47,7 +54,7 @@ if ($flavour =~ /3[12]/) {
 	$g="g";
 }

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

 $softonly=0;
@@ -85,9 +92,7 @@ $code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
 	tmhl	%r0,0x4000	# check for message-security-assist
 	jz	.Lsoft_gmult
 	lghi	%r0,0
-	la	%r1,16($sp)
-	.long	0xb93e0004	# kimd %r0,%r4
-	lg	%r1,24($sp)
+	lg	%r1,24(%r1)	# load second word of kimd capabilities vector
 	tmhh	%r1,0x4000	# check for function 65
 	jz	.Lsoft_gmult
 	stg	%r0,16($sp)	# arrange 16 bytes of zero input
--- a/crypto/modes/asm/ghash-sparcv9.pl
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -46,14 +53,12 @@
 # saturates at ~15.5x single-process result on 8-core processor,
 # or ~20.5GBps per 2.85GHz socket.

-$bits=32;
-for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)  { $bias=2047; $frame=192; }
-else            { $bias=0;    $frame=112; }
-
-$output=shift;
+$output=pop;
 open STDOUT,">$output";

+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
+
 $Zhi="%o0";	# 64-bit values
 $Zlo="%o1";
 $Thi="%o2";
@@ -75,11 +80,14 @@ $Htbl="%i1";
 $inp="%i2";
 $len="%i3";

-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef  __arch64__
 .register	%g2,#scratch
 .register	%g3,#scratch
-___
-$code.=<<___;
+#endif
+
 .section	".text",#alloc,#execinstr

 .align	64
@@ -183,7 +191,7 @@ gcm_ghash_4bit:

 	add	$inp,16,$inp
 	cmp	$inp,$len
-	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
+	be,pn	SIZE_T_CC,.Ldone
 	and	$Zlo,0xf,$remi

 	ldx	[$Htblo+$nhi],$Tlo
@@ -379,7 +387,7 @@ gcm_init_vis3:
 	or	$V,%lo(0xA0406080),$V
 	or	%l0,%lo(0x20C0E000),%l0
 	sllx	$V,32,$V
-	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
+	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
 	stx	$V,[%i0+16]

 	ret
@@ -399,7 +407,7 @@ gcm_gmult_vis3:

 	mov	0xE1,%l7
 	sllx	%l7,57,$xE1		! 57 is not a typo
-	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000
+	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

 	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
 	xmulx	$Xlo,$Hlo,$C0
@@ -411,9 +419,9 @@ gcm_gmult_vis3:
 	xmulx	$Xhi,$Hhi,$Xhi

 	sll	$C0,3,$sqr
-	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
+	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
 	xor	$C0,$sqr,$sqr
-	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

 	xor	$C0,$C1,$C1		! Karatsuba post-processing
 	xor	$Xlo,$C2,$C2
@@ -423,7 +431,7 @@ gcm_gmult_vis3:
 	xor	$Xhi,$C2,$C2
 	xor	$Xhi,$C1,$C1

-	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
+	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
 	 xor	$C0,$C2,$C2
 	xmulx	$C1,$xE1,$C0
 	 xor	$C1,$C3,$C3
@@ -445,6 +453,8 @@ gcm_gmult_vis3:
 .align	32
 gcm_ghash_vis3:
 	save	%sp,-$frame,%sp
+	nop
+	srln	$len,0,$len		! needed on v8+, "nop" on v9

 	ldx	[$Xip+8],$C2		! load Xi
 	ldx	[$Xip+0],$C3
@@ -453,7 +463,7 @@ gcm_ghash_vis3:

 	mov	0xE1,%l7
 	sllx	%l7,57,$xE1		! 57 is not a typo
-	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000
+	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

 	and	$inp,7,$shl
 	andn	$inp,7,$inp
@@ -490,9 +500,9 @@ gcm_ghash_vis3:
 	xmulx	$Xhi,$Hhi,$Xhi

 	sll	$C0,3,$sqr
-	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
+	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
 	xor	$C0,$sqr,$sqr
-	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

 	xor	$C0,$C1,$C1		! Karatsuba post-processing
 	xor	$Xlo,$C2,$C2
@@ -502,7 +512,7 @@ gcm_ghash_vis3:
 	xor	$Xhi,$C2,$C2
 	xor	$Xhi,$C1,$C1

-	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
+	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
 	 xor	$C0,$C2,$C2
 	xmulx	$C1,$xE1,$C0
 	 xor	$C1,$C3,$C3
@@ -530,7 +540,7 @@ ___

 # Purpose of these subroutines is to explicitly encode VIS instructions,
 # so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 # Idea is to reserve for option to produce "universal" binary and let
 # programmer detect if current CPU is VIS capable at run-time.
 sub unvis3 {
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,7 +95,7 @@
 # where Tproc is time required for Karatsuba pre- and post-processing,
 # is more realistic estimate. In this case it gives ... 1.91 cycles.
 # Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
+# and one of the two multiplications the performance should be between
 # 1.91 and 2.16. As already mentioned, this implementation processes
 # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
 # - in 2.02. x86_64 performance is better, because larger register
@@ -129,6 +136,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";

+# The last argument names the output file; abort if it cannot be created.
+$output=pop;
+open STDOUT,">$output" or die "can't open $output: $!";
 &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

 $sse2=0;
@@ -358,7 +368,7 @@ $S=12;		# shift factor for rem_4bit
 # effective address calculation and finally merge of value to Z.hi.
 # Reference to rem_4bit is scheduled so late that I had to >>4
 # rem_4bit elements. This resulted in 20-45% procent improvement
-# on contemporary µ-archs.
+# on contemporary µ-archs.
 {
    my $cnt;
    my $rem_4bit = "eax";
@@ -712,7 +722,7 @@ sub mmx_loop() {
    &pxor	($red[1],$red[1]);
    &pxor	($red[2],$red[2]);

-    # Just like in "May" verson modulo-schedule for critical path in
+    # Just like in "May" version modulo-schedule for critical path in
    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
    # is scheduled so late that rem_8bit[] has to be shifted *right*
    # by 16, which is why last argument to pinsrw is 2, which
@@ -1138,7 +1148,7 @@ my ($Xhi,$Xi) = @_;
 	&movdqu		(&QWP(0,$Xip),$Xi);
 &function_end("gcm_ghash_clmul");

-} else {		# Algorith 5. Kept for reference purposes.
+} else {		# Algorithm 5. Kept for reference purposes.

 sub reduction_alg5 {	# 19/16 times faster than Intel version
 my ($Xhi,$Xi)=@_;
@@ -1369,6 +1379,8 @@ my ($Xhi,$Xi)=@_;
 &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();

+close STDOUT;
+
 # A question was risen about choice of vanilla MMX. Or rather why wasn't
 # SSE2 chosen instead? In addition to the fact that MMX runs on legacy
 # CPUs such as PIII, "4-bit" MMX version was observed to provide better
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -64,8 +71,10 @@
 # Ivy Bridge	1.80(+7%)
 # Haswell	0.55(+93%) (if system doesn't support AVX)
 # Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Goldmont	1.08(+24%)

 # March 2013
 #
@@ -74,8 +83,8 @@
 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
 # sub-optimally in comparison to above mentioned version. But thanks
 # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

@@ -92,7 +101,7 @@ die "can't locate x86_64-xlate.pl";

 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-	$avx = ($1>=2.19) + ($1>=2.22);
+	$avx = ($1>=2.20) + ($1>=2.22);
 }

 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -105,11 +114,11 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=11);
 }

-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }

-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;

 $do4xaggr=1;
@@ -576,15 +585,15 @@ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
 	# experimental alternative. special thing about is that there
 	# no dependency between the two multiplications... 
 	mov		\$`0xE1<<1`,%eax
-	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
+	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
 	mov		\$0x07,%r11d
 	movq		%rax,$T1
 	movq		%r10,$T2
 	movq		%r11,$T3		# borrow $T3
 	pand		$Xi,$T3
-	pshufb		$T3,$T2			# ($Xi&7)·0xE0
+	pshufb		$T3,$T2			# ($Xi&7)·0xE0
 	movq		%rax,$T3
-	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
+	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
 	pxor		$Xi,$T2
 	pslldq		\$15,$T2
 	paddd		$T2,$T2			# <<(64+56+1)
@@ -657,7 +666,7 @@ $code.=<<___;
 	je		.Lskip4x

 	sub		\$0x30,$len
-	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
 	movdqu		0x30($Htbl),$Hkey3
 	movdqu		0x40($Htbl),$Hkey4

--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,6 +25,12 @@
 # faster than "4-bit" integer-only compiler-generated 64-bit code.
 # "Initial version" means that there is room for futher improvement.

+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+
 $flavour=shift;
 $output =shift;

@@ -27,14 +40,21 @@ if ($flavour =~ /64/) {
 	$STU="stdu";
 	$POP="ld";
 	$PUSH="std";
+	$UCMP="cmpld";
+	$SHRI="srdi";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T=4;
 	$LRSAVE=$SIZE_T;
 	$STU="stwu";
 	$POP="lwz";
 	$PUSH="stw";
+	$UCMP="cmplw";
+	$SHRI="srwi";
 } else { die "nonsense $flavour"; }

+$sp="r1";
+$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -46,6 +66,7 @@ my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

 my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
 my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
 my $vrsave="r12";

 $code=<<___;
@@ -56,7 +77,7 @@ $code=<<___;
 .globl	.gcm_init_p8
 .align	5
 .gcm_init_p8:
-	lis		r0,0xfff0
+	li		r0,-4096
 	li		r8,0x10
 	mfspr		$vrsave,256
 	li		r9,0x20
@@ -78,17 +99,103 @@ $code=<<___;
 	vsl		$H,$H,$t0		# H<<=1
 	vsrab		$t1,$t1,$t2		# broadcast carry bit
 	vand		$t1,$t1,$xC2
-	vxor		$H,$H,$t1		# twisted H
+	vxor		$IN,$H,$t1		# twisted H

-	vsldoi		$H,$H,$H,8		# twist even more ...
+	vsldoi		$H,$IN,$IN,8		# twist even more ...
 	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
 	vsldoi		$Hl,$zero,$H,8		# ... and split
 	vsldoi		$Hh,$H,$zero,8

 	stvx_u		$xC2,0,r3		# save pre-computed table
 	stvx_u		$Hl,r8,r3
+	li		r8,0x40
 	stvx_u		$H, r9,r3
+	li		r9,0x50
 	stvx_u		$Hh,r10,r3
+	li		r10,0x60
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$IN1,$Xl,$t1
+
+	vsldoi		$H2,$IN1,$IN1,8
+	vsldoi		$H2l,$zero,$H2,8
+	vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$H2l,r8,r3		# save H^2
+	li		r8,0x70
+	stvx_u		$H2,r9,r3
+	li		r9,0x80
+	stvx_u		$H2h,r10,r3
+	li		r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
+	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
+	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
+	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
+	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
+	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vsldoi		$t4,$Xm1,$zero,8
+	 vsldoi		$t5,$zero,$Xm1,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xl1,$Xl1,$t4
+	 vxor		$Xh1,$Xh1,$t5
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	 vsldoi		$Xl1,$Xl1,$Xl1,8
+	vxor		$Xl,$Xl,$t2
+	 vxor		$Xl1,$Xl1,$t6
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 vpmsumd	$Xl1,$Xl1,$xC2
+	vxor		$t1,$t1,$Xh
+	 vxor		$t5,$t5,$Xh1
+	vxor		$Xl,$Xl,$t1
+	 vxor		$Xl1,$Xl1,$t5
+
+	vsldoi		$H,$Xl,$Xl,8
+	 vsldoi		$H2,$Xl1,$Xl1,8
+	vsldoi		$Hl,$zero,$H,8
+	vsldoi		$Hh,$H,$zero,8
+	 vsldoi		$H2l,$zero,$H2,8
+	 vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$Hl,r8,r3		# save H^3
+	li		r8,0xa0
+	stvx_u		$H,r9,r3
+	li		r9,0xb0
+	stvx_u		$Hh,r10,r3
+	li		r10,0xc0
+	 stvx_u		$H2l,r8,r3		# save H^4
+	 stvx_u		$H2,r9,r3
+	 stvx_u		$H2h,r10,r3

 	mtspr		256,$vrsave
 	blr
@@ -96,7 +203,9 @@ $code=<<___;
 	.byte		0,12,0x14,0,0,0,2,0
 	.long		0
 .size	.gcm_init_p8,.-.gcm_init_p8
-
+___
+}
+$code.=<<___;
 .globl	.gcm_gmult_p8
 .align	5
 .gcm_gmult_p8:
@@ -118,11 +227,11 @@ $code=<<___;
 	 le?vperm	$IN,$IN,$IN,$lemask
 	vxor		$zero,$zero,$zero

-	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
-	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
-	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

-	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

 	vsldoi		$t0,$Xm,$zero,8
 	vsldoi		$t1,$zero,$Xm,8
@@ -132,7 +241,7 @@ $code=<<___;
 	vsldoi		$Xl,$Xl,$Xl,8
 	vxor		$Xl,$Xl,$t2

-	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
 	vpmsumd		$Xl,$Xl,$xC2
 	vxor		$t1,$t1,$Xh
 	vxor		$Xl,$Xl,$t1
@@ -150,7 +259,7 @@ $code=<<___;
 .globl	.gcm_ghash_p8
 .align	5
 .gcm_ghash_p8:
-	lis		r0,0xfff8
+	li		r0,-4096
 	li		r8,0x10
 	mfspr		$vrsave,256
 	li		r9,0x20
@@ -159,33 +268,85 @@ $code=<<___;
 	lvx_u		$Xl,0,$Xip		# load Xi

 	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	li		r8,0x40
 	 le?lvsl	$lemask,r0,r0
 	lvx_u		$H, r9,$Htbl
+	li		r9,0x50
 	 le?vspltisb	$t0,0x07
 	lvx_u		$Hh,r10,$Htbl
+	li		r10,0x60
 	 le?vxor	$lemask,$lemask,$t0
 	lvx_u		$xC2,0,$Htbl
 	 le?vperm	$Xl,$Xl,$Xl,$lemask
 	vxor		$zero,$zero,$zero

+	${UCMP}i	$len,64
+	bge		Lgcm_ghash_p8_4x
+
 	lvx_u		$IN,0,$inp
 	addi		$inp,$inp,16
-	subi		$len,$len,16
+	subic.		$len,$len,16
 	 le?vperm	$IN,$IN,$IN,$lemask
 	vxor		$IN,$IN,$Xl
-	b		Loop
+	beq		Lshort
+
+	lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,16
+	lvx_u		$H2, r9,$Htbl
+	add		r9,$inp,$len		# end of input
+	lvx_u		$H2h,r10,$Htbl
+	be?b		Loop_2x

 .align	5
-Loop:
-	 subic		$len,$len,16
-	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
-	 subfe.		r0,r0,r0		# borrow?-1:0
-	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+Loop_2x:
+	lvx_u		$IN1,0,$inp
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	 subic		$len,$len,32
+	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
+	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
+	 subfe		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
+	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
 	 and		r0,r0,$len
-	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
+	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
 	 add		$inp,$inp,r0

-	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+	vxor		$Xl,$Xl,$Xl1
+	vxor		$Xm,$Xm,$Xm1
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh1
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,r8,$inp
+	 addi		$inp,$inp,32
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	$UCMP		r9,$inp
+	bgt		Loop_2x			# done yet?
+
+	cmplwi		$len,0
+	bne		Leven
+
+Lshort:
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

 	vsldoi		$t0,$Xm,$zero,8
 	vsldoi		$t1,$zero,$Xm,8
@@ -194,17 +355,12 @@ Loop:

 	vsldoi		$Xl,$Xl,$Xl,8
 	vxor		$Xl,$Xl,$t2
-	 lvx_u		$IN,0,$inp
-	 addi		$inp,$inp,16

-	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
 	vpmsumd		$Xl,$Xl,$xC2
-	 le?vperm	$IN,$IN,$IN,$lemask
 	vxor		$t1,$t1,$Xh
-	vxor		$IN,$IN,$t1
-	vxor		$IN,$IN,$Xl
-	beq		Loop			# did $len-=16 borrow?

+Leven:
 	vxor		$Xl,$Xl,$t1
 	le?vperm	$Xl,$Xl,$Xl,$lemask
 	stvx_u		$Xl,0,$Xip		# write out Xi
@@ -214,6 +370,284 @@ Loop:
 	.long		0
 	.byte		0,12,0x14,0,0,0,4,0
 	.long		0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align	5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+	$STU		$sp,-$FRAME($sp)
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	stvx		v20,r10,$sp
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	li		r10,0x60
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
+	mtspr		256,r0			# preserve all AltiVec registers
+
+	lvsl		$t0,0,r8		# 0x0001..0e0f
+	#lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,0x70
+	lvx_u		$H2, r9,$Htbl
+	li		r9,0x80
+	vspltisb	$t1,8			# 0x0808..0808
+	#lvx_u		$H2h,r10,$Htbl
+	li		r10,0x90
+	lvx_u		$H3l,r8,$Htbl		# load H^3
+	li		r8,0xa0
+	lvx_u		$H3, r9,$Htbl
+	li		r9,0xb0
+	lvx_u		$H3h,r10,$Htbl
+	li		r10,0xc0
+	lvx_u		$H4l,r8,$Htbl		# load H^4
+	li		r8,0x10
+	lvx_u		$H4, r9,$Htbl
+	li		r9,0x20
+	lvx_u		$H4h,r10,$Htbl
+	li		r10,0x30
+
+	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
+	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
+	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
+
+	$SHRI		$len,$len,4		# this allows to use sign bit
+						# as carry
+	lvx_u		$IN0,0,$inp		# load input
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,8
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	 vperm		$H21l,$H2,$H,$hiperm
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$H21h,$H2,$H,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+	 vxor		$Xm2,$Xm2,$Xm1
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xm3,$Xm3,$Xm2
+	 vxor		$Xh3,$Xh3,$Xh1
+
+	blt		Ltail_4x
+
+Loop_4x:
+	lvx_u		$IN0,0,$inp
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,4
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+	vxor		$Xh,$Xh,$Xh3
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
+	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	vpmsumd		$Xl,$Xl,$xC2
+
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xh3,$Xh3,$Xh1
+	vxor		$Xh,$Xh,$IN0
+	 vxor		$Xm2,$Xm2,$Xm1
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xm3,$Xm3,$Xm2
+	vxor		$Xh,$Xh,$Xl
+	bge		Loop_4x
+
+Ltail_4x:
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh3
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	addic.		$len,$len,4
+	beq		Ldone_4x
+
+	lvx_u		$IN0,0,$inp
+	${UCMP}i	$len,2
+	li		$len,-4
+	blt		Lone
+	lvx_u		$IN1,r8,$inp
+	beq		Ltwo
+
+Lthree:
+	lvx_u		$IN2,r9,$inp		# third remaining block
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+
+	vxor		$Xh,$IN0,$Xl		# fold current Xi into first remaining block
+	vmr		$H4l,$H3l		# three blocks left: the tail multiplies
+	vmr		$H4, $H3		# by H^3 in place of H^4
+	vmr		$H4h,$H3h
+
+	vperm		$t0,$IN1,$IN2,$loperm
+	vperm		$t1,$IN1,$IN2,$hiperm
+	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
+	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+	vxor		$Xm3,$Xm3,$Xm2		# combine the two middle products
+	b		Ltail_4x
+
+.align	4
+Ltwo:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	vxor		$Xh,$IN0,$Xl		# fold current Xi into first remaining block
+	vperm		$t0,$zero,$IN1,$loperm	# zero stands in for the absent block
+	vperm		$t1,$zero,$IN1,$hiperm
+
+	vsldoi		$H4l,$zero,$H2,8	# two blocks left: the tail multiplies
+	vmr		$H4, $H2		# by H^2 in place of H^4; lo/hi forms
+	vsldoi		$H4h,$H2,$zero,8	# are rebuilt from H^2 on the fly
+
+	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
+	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
+	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
+
+	b		Ltail_4x
+
+.align	4
+Lone:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vsldoi		$H4l,$zero,$H,8		# one block left: the tail multiplies
+	vmr		$H4, $H			# by H in place of H^4; lo/hi forms
+	vsldoi		$H4h,$H,$zero,8		# are rebuilt from H on the fly
+
+	vxor		$Xh,$IN0,$Xl		# fold current Xi into the last block
+	vxor		$Xl3,$Xl3,$Xl3		# no further blocks: zero the deferred
+	vxor		$Xm3,$Xm3,$Xm3		# products that the tail would
+	vxor		$Xh3,$Xh3,$Xh3		# otherwise accumulate
+
+	b		Ltail_4x
+
+Ldone_4x:
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	addi		$sp,$sp,$FRAME
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,0,4,0
+	.long		0
+___
+}
+$code.=<<___;
 .size	.gcm_ghash_p8,.-.gcm_ghash_p8

 .asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
@@ -221,6 +655,8 @@ Loop:
 ___

 foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
 	if ($flavour =~ /le$/o) {	# little-endian
 	    s/le\?//o		or
 	    s/be\?/#be#/o;
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,11 +34,21 @@
 # Apple A7	0.92		5.62
 # Cortex-A53	1.01		8.39
 # Cortex-A57	1.17		7.61
+# Denver	0.71		6.02
+# Mongoose	1.10		8.06
 #
 # (*)	presented for reference/comparison purposes;

 $flavour = shift;
-open STDOUT,">".shift;
+$output  = shift;
+# Find arm-xlate.pl beside this script or in the shared perlasm directory.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+# Filter output through the translator; paths are quoted to survive spaces.
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT=*OUT;

 $Xi="x0";	# argument block
 $Htbl="x1";
@@ -135,10 +152,10 @@ gcm_gmult_v8:
 #endif
 	vext.8		$IN,$t1,$t1,#8

-	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
+	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
 	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
-	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
-	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

 	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
 	veor		$t2,$Xl,$Xh
@@ -226,7 +243,7 @@ $code.=<<___;
 #endif
 	vext.8		$In,$t1,$t1,#8
 	veor		$IN,$IN,$Xl		@ I[i]^=Xi
-	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
+	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
 	veor		$t1,$t1,$In		@ Karatsuba pre-processing
 	vpmull2.p64	$Xhn,$H,$In
 	b		.Loop_mod2x_v8
@@ -235,14 +252,14 @@ $code.=<<___;
 .Loop_mod2x_v8:
 	vext.8		$t2,$IN,$IN,#8
 	subs		$len,$len,#32		@ is there more data?
-	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
+	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
 	cclr		$inc,lo			@ is it time to zero $inc?

 	 vpmull.p64	$Xmn,$Hhl,$t1
 	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
-	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
+	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
 	veor		$Xl,$Xl,$Xln		@ accumulate
-	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
 	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i+2]

 	veor		$Xh,$Xh,$Xhn
@@ -267,7 +284,7 @@ $code.=<<___;
 	 vext.8		$In,$t1,$t1,#8
 	 vext.8		$IN,$t0,$t0,#8
 	veor		$Xl,$Xm,$t2
-	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
+	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
 	veor		$IN,$IN,$Xh		@ accumulate $IN early

 	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
@@ -291,10 +308,10 @@ $code.=<<___;
 	veor		$IN,$IN,$Xl		@ inp^=Xi
 	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

-	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
+	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
 	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
-	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
-	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

 	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
 	veor		$t2,$Xl,$Xh
--- a/crypto/modes/build.info
+++ b/crypto/modes/build.info
@@ -0,0 +1,27 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+        cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
+        ccm128.c xts128.c wrap128.c ocb128.c \
+        {- $target{modes_asm_src} -}
+
+INCLUDE[gcm128.o]=..
+
+GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl $(CFLAGS) $(LIB_CFLAGS)
+GENERATE[ghash-x86.s]=asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[ghash-sparcv9.o]=..
+GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME)
+GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME)
+GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME)
+GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[ghash-armv4.o]=..
+GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME)
+INCLUDE[ghashv8-armx.o]=..
+
+BEGINRAW[Makefile]
+# GNU make "catch all": pattern rule building any ghash-*.S from its asm/ghash-*.pl script
+{- $builddir -}/ghash-%.S:	{- $sourcedir -}/asm/ghash-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+ENDRAW[Makefile]
--- a/crypto/modes/cbc128.c
+++ b/crypto/modes/cbc128.c
@@ -1,64 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 #if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
 # define STRICT_ALIGNMENT 0
 #endif
@@ -70,8 +22,6 @@ void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
    size_t n;
    const unsigned char *iv = ivec;

-    assert(in && out && key && ivec);
-
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (STRICT_ALIGNMENT &&
        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
@@ -123,8 +73,6 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
        unsigned char c[16];
    } tmp;

-    assert(in && out && key && ivec);
-
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (in != out) {
        const unsigned char *iv = ivec;
--- a/crypto/modes/ccm128.c
+++ b/crypto/modes/ccm128.c
@@ -1,63 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 /*
 * First you setup M and L parameters and pass the key schedule. This is
 * called once per session setup...
--- a/crypto/modes/cfb128.c
+++ b/crypto/modes/cfb128.c
@@ -1,64 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 /*
 * The input and output encrypted as though 128bit cfb mode is being used.
 * The extra state information to record how much of the 128bit block we have
@@ -72,8 +24,6 @@ void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
    unsigned int n;
    size_t l = 0;

-    assert(in && out && key && ivec && num);
-
    n = *num;

    if (enc) {
@@ -190,7 +140,7 @@ static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
                               block128_f block)
 {
    int n, rem, num;
-    unsigned char ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't
+    unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't
                                     * use) one byte off the end */

    if (nbits <= 0 || nbits > 128)
@@ -228,9 +178,6 @@ void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
    size_t n;
    unsigned char c[1], d[1];

-    assert(in && out && key && ivec && num);
-    assert(*num == 0);
-
    for (n = 0; n < bits; ++n) {
        c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
        cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
@@ -246,9 +193,6 @@ void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
 {
    size_t n;

-    assert(in && out && key && ivec && num);
-    assert(*num == 0);
-
    for (n = 0; n < length; ++n)
        cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
 }
--- a/crypto/modes/ctr128.c
+++ b/crypto/modes/ctr128.c
@@ -1,64 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 /*
 * NOTE: the IV/counter CTR mode is big-endian.  The code itself is
 * endian-neutral.
@@ -67,23 +19,20 @@
 /* increment counter (128-bit int) by 1 */
 static void ctr128_inc(unsigned char *counter)
 {
-    u32 n = 16;
-    u8 c;
+    u32 n = 16, c = 1;

    do {
        --n;
-        c = counter[n];
-        ++c;
-        counter[n] = c;
-        if (c)
-            return;
+        c += counter[n];
+        counter[n] = (u8)c;
+        c >>= 8;
    } while (n);
 }

 #if !defined(OPENSSL_SMALL_FOOTPRINT)
 static void ctr128_inc_aligned(unsigned char *counter)
 {
-    size_t *data, c, n;
+    size_t *data, c, d, n;
    const union {
        long one;
        char little;
@@ -91,20 +40,19 @@ static void ctr128_inc_aligned(unsigned char *counter)
        1
    };

-    if (is_endian.little) {
+    if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
        ctr128_inc(counter);
        return;
    }

    data = (size_t *)counter;
+    c = 1;
    n = 16 / sizeof(size_t);
    do {
        --n;
-        c = data[n];
-        ++c;
-        data[n] = c;
-        if (c)
-            return;
+        d = data[n] += c;
+        /* did addition carry? (old top bit set, new top bit clear) */
+        c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1);
    } while (n);
 }
 #endif
@@ -117,7 +65,7 @@ static void ctr128_inc_aligned(unsigned char *counter)
 * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
 * that the counter is in the x lower bits of the IV (ivec), and that the
 * application has full control over overflow and the rest of the IV.  This
- * implementation takes NO responsability for checking that the counter
+ * implementation takes NO responsibility for checking that the counter
 * doesn't overflow into the rest of the IV when incremented.
 */
 void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
@@ -129,9 +77,6 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
    unsigned int n;
    size_t l = 0;

-    assert(in && out && key && ecount_buf && num);
-    assert(*num < 16);
-
    n = *num;

 #if !defined(OPENSSL_SMALL_FOOTPRINT)
@@ -144,14 +89,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
            }

 # if defined(STRICT_ALIGNMENT)
-            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
-                0)
+            if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+                % sizeof(size_t) != 0)
                break;
 # endif
            while (len >= 16) {
                (*block) (ivec, ecount_buf, key);
                ctr128_inc_aligned(ivec);
-                for (; n < 16; n += sizeof(size_t))
+                for (n = 0; n < 16; n += sizeof(size_t))
                    *(size_t *)(out + n) =
                        *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
                len -= 16;
@@ -189,16 +134,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
 /* increment upper 96 bits of 128-bit counter by 1 */
 static void ctr96_inc(unsigned char *counter)
 {
-    u32 n = 12;
-    u8 c;
+    u32 n = 12, c = 1;

    do {
        --n;
-        c = counter[n];
-        ++c;
-        counter[n] = c;
-        if (c)
-            return;
+        c += counter[n];
+        counter[n] = (u8)c;
+        c >>= 8;
    } while (n);
 }

@@ -210,9 +152,6 @@ void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
 {
    unsigned int n, ctr32;

-    assert(in && out && key && ecount_buf && num);
-    assert(*num < 16);
-
    n = *num;

    while (n && len) {
@@ -245,7 +184,7 @@ void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
        (*func) (in, out, blocks, key, ivec);
        /* (*ctr) does not update ivec, caller does: */
        PUTU32(ivec + 12, ctr32);
-        /* ... overflow was detected, propogate carry. */
+        /* ... overflow was detected, propagate carry. */
        if (ctr32 == 0)
            ctr96_inc(ivec);
        blocks *= 16;
--- a/crypto/modes/cts128.c
+++ b/crypto/modes/cts128.c
@@ -1,21 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Rights for redistribution and usage in source and binary
- * forms are granted according to the OpenSSL license.
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 /*
 * Trouble with Ciphertext Stealing, CTS, mode is that there is no
 * common official specification, but couple of cipher/application
@@ -36,8 +31,6 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
 {
    size_t residue, n;

-    assert(in && out && key && ivec);
-
    if (len <= 16)
        return 0;

@@ -68,8 +61,6 @@ size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
 {
    size_t residue, n;

-    assert(in && out && key && ivec);
-
    if (len < 16)
        return 0;

@@ -103,8 +94,6 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
        unsigned char c[16];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len <= 16)
        return 0;

@@ -141,8 +130,6 @@ size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
        unsigned char c[16];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len < 16)
        return 0;

@@ -179,8 +166,6 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
        unsigned char c[32];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len <= 16)
        return 0;

@@ -224,8 +209,6 @@ size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
        unsigned char c[32];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len < 16)
        return 0;

@@ -272,8 +255,6 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
        unsigned char c[32];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len <= 16)
        return 0;

@@ -314,8 +295,6 @@ size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
        unsigned char c[32];
    } tmp;

-    assert(in && out && key && ivec);
-
    if (len < 16)
        return 0;

--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -1,65 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+/*
+ * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

-#define OPENSSL_FIPSAPI
-
 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
 /* redefine, because alignment is ensured */
 # undef  GETU32
@@ -150,9 +101,7 @@ static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
@@ -321,9 +270,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
        const union {
            long one;
            char little;
-        } is_endian = {
-            1
-        };
+        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
@@ -356,9 +303,7 @@ static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
@@ -437,9 +382,7 @@ static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };

 #   if 1
    do {
@@ -629,9 +572,7 @@ static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];
@@ -774,9 +715,7 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
@@ -801,6 +740,11 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
 #if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
 #elif   TABLE_BITS==4
+# if    defined(GHASH)
+#  define CTX__GHASH(f) (ctx->ghash = (f))
+# else
+#  define CTX__GHASH(f) (ctx->ghash = NULL)
+# endif
 # if    defined(GHASH_ASM_X86_OR_64)
 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
@@ -808,11 +752,11 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
-            ctx->ghash = gcm_ghash_avx;
+            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
-            ctx->ghash = gcm_ghash_clmul;
+            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
@@ -825,66 +769,59 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
 #   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
-        ctx->ghash = gcm_ghash_4bit_mmx;
+        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
-        ctx->ghash = gcm_ghash_4bit_x86;
+        CTX__GHASH(gcm_ghash_4bit_x86);
    }
 #  else
    ctx->gmult = gcm_gmult_4bit;
-    ctx->ghash = gcm_ghash_4bit;
+    CTX__GHASH(gcm_ghash_4bit);
 #  endif
 # elif  defined(GHASH_ASM_ARM)
 #  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
-        ctx->ghash = gcm_ghash_v8;
+        CTX__GHASH(gcm_ghash_v8);
    } else
 #  endif
 #  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
-        ctx->ghash = gcm_ghash_neon;
+        CTX__GHASH(gcm_ghash_neon);
    } else
 #  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
-#  if defined(GHASH)
-        ctx->ghash = gcm_ghash_4bit;
-#  else
-        ctx->ghash = NULL;
-#  endif
+        CTX__GHASH(gcm_ghash_4bit);
    }
 # elif  defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
-        ctx->ghash = gcm_ghash_vis3;
+        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
-        ctx->ghash = gcm_ghash_4bit;
+        CTX__GHASH(gcm_ghash_4bit);
    }
 # elif  defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
-        ctx->ghash = gcm_ghash_p8;
+        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
-#  if defined(GHASH)
-        ctx->ghash = gcm_ghash_4bit;
-#  else
-        ctx->ghash = NULL;
-#  endif
+        CTX__GHASH(gcm_ghash_4bit);
    }
 # else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
 # endif
+# undef CTX__GHASH
 #endif
 }

@@ -894,9 +831,7 @@ void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    unsigned int ctr;
 #ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
@@ -1040,9 +975,7 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];
@@ -1050,15 +983,12 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
+# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif

-#if 0
-    n = (unsigned int)mlen % 16; /* alternative to ctx->mres */
-#endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
@@ -1100,7 +1030,8 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
 # endif
-# if defined(GHASH) && defined(GHASH_CHUNK)
+# if defined(GHASH)
+#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

@@ -1111,11 +1042,11 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
-#  ifdef BSWAP4
+#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
-#  else
+#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
-#  endif
+#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
@@ -1127,6 +1058,7 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
+#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

@@ -1227,9 +1159,7 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];
@@ -1237,7 +1167,7 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
+# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
@@ -1286,7 +1216,8 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
 # endif
-# if defined(GHASH) && defined(GHASH_CHUNK)
+# if defined(GHASH)
+#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

@@ -1298,11 +1229,11 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
-#  ifdef BSWAP4
+#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
-#  else
+#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
-#  endif
+#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
@@ -1313,6 +1244,7 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                }
                len -= GHASH_CHUNK;
            }
+#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
@@ -1416,23 +1348,24 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
 {
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
+#else
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
+# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
+#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
+#  endif
 # endif
-#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
@@ -1446,11 +1379,11 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    }

    if (is_endian.little)
-#ifdef BSWAP4
+# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
-#else
+# else
        ctr = GETU32(ctx->Yi.c + 12);
-#endif
+# endif
    else
        ctr = ctx->Yi.d[3];

@@ -1468,60 +1401,60 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
            return 0;
        }
    }
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+# if defined(GHASH) && defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
+#  ifdef BSWAP4
+            ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+            PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+        else
+            ctx->Yi.d[3] = ctr;
+        GHASH(ctx, out, GHASH_CHUNK);
+        out += GHASH_CHUNK;
+        in += GHASH_CHUNK;
+        len -= GHASH_CHUNK;
+    }
+# endif
+    if ((i = (len & (size_t)-16))) {
+        size_t j = i / 16;
+
+        (*stream) (in, out, j, key, ctx->Yi.c);
+        ctr += (unsigned int)j;
+        if (is_endian.little)
 # ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
 # else
            PUTU32(ctx->Yi.c + 12, ctr);
 # endif
-        else
-            ctx->Yi.d[3] = ctr;
-        GHASH(ctx, out, GHASH_CHUNK);
-        out += GHASH_CHUNK;
-        in += GHASH_CHUNK;
-        len -= GHASH_CHUNK;
-    }
-#endif
-    if ((i = (len & (size_t)-16))) {
-        size_t j = i / 16;
-
-        (*stream) (in, out, j, key, ctx->Yi.c);
-        ctr += (unsigned int)j;
-        if (is_endian.little)
-#ifdef BSWAP4
-            ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-            PUTU32(ctx->Yi.c + 12, ctr);
-#endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
-#if defined(GHASH)
+# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
-#else
+# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx, Xi);
            out += 16;
        }
-#endif
+# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
-#ifdef BSWAP4
+# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
-#else
+# else
            PUTU32(ctx->Yi.c + 12, ctr);
-#endif
+# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
@@ -1532,29 +1465,31 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,

    ctx->mres = n;
    return 0;
+#endif
 }

 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
 {
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
+#else
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
+# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
+#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
+#  endif
 # endif
-#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
@@ -1568,11 +1503,11 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    }

    if (is_endian.little)
-#ifdef BSWAP4
+# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
-#else
+# else
        ctr = GETU32(ctx->Yi.c + 12);
-#endif
+# endif
    else
        ctr = ctx->Yi.d[3];

@@ -1592,30 +1527,30 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
            return 0;
        }
    }
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+# if defined(GHASH) && defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
-# ifdef BSWAP4
+#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
-# else
+#  else
            PUTU32(ctx->Yi.c + 12, ctr);
-# endif
+#  endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
-#endif
+# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

-#if defined(GHASH)
+# if defined(GHASH)
        GHASH(ctx, in, i);
-#else
+# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
@@ -1625,15 +1560,15 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        }
        j = i / 16;
        in -= i;
-#endif
+# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
-#ifdef BSWAP4
+# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
-#else
+# else
            PUTU32(ctx->Yi.c + 12, ctr);
-#endif
+# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
@@ -1644,11 +1579,11 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
-#ifdef BSWAP4
+# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
-#else
+# else
            PUTU32(ctx->Yi.c + 12, ctr);
-#endif
+# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
@@ -1661,6 +1596,7 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,

    ctx->mres = n;
    return 0;
+#endif
 }

 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
@@ -1669,9 +1605,7 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    const union {
        long one;
        char little;
-    } is_endian = {
-        1
-    };
+    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
 #ifdef GCM_FUNCREF_4BIT
@@ -1720,7 +1654,7 @@ GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
 {
    GCM128_CONTEXT *ret;

-    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
@@ -1728,10 +1662,7 @@ GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)

 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
 {
-    if (ctx) {
-        OPENSSL_cleanse(ctx, sizeof(*ctx));
-        OPENSSL_free(ctx);
-    }
+    OPENSSL_clear_free(ctx, sizeof(*ctx));
 }

 #if defined(SELFTEST)
--- a/crypto/modes/modes.h
+++ b/crypto/modes/modes.h
@@ -1,163 +0,0 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Rights for redistribution and usage in source and binary
- * forms are granted according to the OpenSSL license.
- */
-
-#include <stddef.h>
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-typedef void (*block128_f) (const unsigned char in[16],
-                            unsigned char out[16], const void *key);
-
-typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
-                          size_t len, const void *key,
-                          unsigned char ivec[16], int enc);
-
-typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
-                          size_t blocks, const void *key,
-                          const unsigned char ivec[16]);
-
-typedef void (*ccm128_f) (const unsigned char *in, unsigned char *out,
-                          size_t blocks, const void *key,
-                          const unsigned char ivec[16],
-                          unsigned char cmac[16]);
-
-void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
-                           size_t len, const void *key,
-                           unsigned char ivec[16], block128_f block);
-void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
-                           size_t len, const void *key,
-                           unsigned char ivec[16], block128_f block);
-
-void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-                           size_t len, const void *key,
-                           unsigned char ivec[16],
-                           unsigned char ecount_buf[16], unsigned int *num,
-                           block128_f block);
-
-void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
-                                 size_t len, const void *key,
-                                 unsigned char ivec[16],
-                                 unsigned char ecount_buf[16],
-                                 unsigned int *num, ctr128_f ctr);
-
-void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-                           size_t len, const void *key,
-                           unsigned char ivec[16], int *num,
-                           block128_f block);
-
-void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-                           size_t len, const void *key,
-                           unsigned char ivec[16], int *num,
-                           int enc, block128_f block);
-void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
-                             size_t length, const void *key,
-                             unsigned char ivec[16], int *num,
-                             int enc, block128_f block);
-void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
-                             size_t bits, const void *key,
-                             unsigned char ivec[16], int *num,
-                             int enc, block128_f block);
-
-size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
-                                   unsigned char *out, size_t len,
-                                   const void *key, unsigned char ivec[16],
-                                   block128_f block);
-size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
-                             size_t len, const void *key,
-                             unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
-                                   unsigned char *out, size_t len,
-                                   const void *key, unsigned char ivec[16],
-                                   block128_f block);
-size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
-                             size_t len, const void *key,
-                             unsigned char ivec[16], cbc128_f cbc);
-
-size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
-                                       unsigned char *out, size_t len,
-                                       const void *key,
-                                       unsigned char ivec[16],
-                                       block128_f block);
-size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
-                                 size_t len, const void *key,
-                                 unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
-                                       unsigned char *out, size_t len,
-                                       const void *key,
-                                       unsigned char ivec[16],
-                                       block128_f block);
-size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
-                                 size_t len, const void *key,
-                                 unsigned char ivec[16], cbc128_f cbc);
-
-typedef struct gcm128_context GCM128_CONTEXT;
-
-GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);
-void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
-                         size_t len);
-int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
-                      size_t len);
-int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
-                          const unsigned char *in, unsigned char *out,
-                          size_t len);
-int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
-                          const unsigned char *in, unsigned char *out,
-                          size_t len);
-int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
-                                const unsigned char *in, unsigned char *out,
-                                size_t len, ctr128_f stream);
-int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
-                                const unsigned char *in, unsigned char *out,
-                                size_t len, ctr128_f stream);
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
-                         size_t len);
-void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
-void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
-
-typedef struct ccm128_context CCM128_CONTEXT;
-
-void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
-                        unsigned int M, unsigned int L, void *key,
-                        block128_f block);
-int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, const unsigned char *nonce,
-                        size_t nlen, size_t mlen);
-void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, const unsigned char *aad,
-                       size_t alen);
-int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
-                          unsigned char *out, size_t len);
-int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
-                          unsigned char *out, size_t len);
-int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
-                                unsigned char *out, size_t len,
-                                ccm128_f stream);
-int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
-                                unsigned char *out, size_t len,
-                                ccm128_f stream);
-size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
-
-typedef struct xts128_context XTS128_CONTEXT;
-
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
-                          const unsigned char iv[16],
-                          const unsigned char *inp, unsigned char *out,
-                          size_t len, int enc);
-
-size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
-                       unsigned char *out,
-                       const unsigned char *in, size_t inlen,
-                       block128_f block);
-
-size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
-                         unsigned char *out,
-                         const unsigned char *in, size_t inlen,
-                         block128_f block);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@@ -1,8 +1,10 @@
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+/*
+ * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Redistribution and use is governed by OpenSSL license.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/modes.h>
@@ -141,3 +143,43 @@ struct ccm128_context {
    block128_f block;
    void *key;
 };
+
+#ifndef OPENSSL_NO_OCB
+
+typedef union {
+    u64 a[2];               /* the block viewed as two 64-bit words */
+    unsigned char c[16];    /* the same 16 bytes, addressed individually */
+} OCB_BLOCK;
+# define ocb_block16_xor(in1,in2,out) \
+    ( (out)->a[0]=(in1)->a[0]^(in2)->a[0], \
+      (out)->a[1]=(in1)->a[1]^(in2)->a[1] )
+# if STRICT_ALIGNMENT       /* XOR through the byte view, one byte at a time */
+#  define ocb_block16_xor_misaligned(in1,in2,out) \
+    ocb_block_xor((in1)->c,(in2)->c,16,(out)->c)
+# else                      /* otherwise simply reuse the word-wise macro */
+#  define ocb_block16_xor_misaligned ocb_block16_xor
+# endif                     /* STRICT_ALIGNMENT */
+
+struct ocb128_context {
+    /* Need both encrypt and decrypt key schedules for decryption */
+    block128_f encrypt;
+    block128_f decrypt;
+    void *keyenc;
+    void *keydec;
+    ocb128_f stream;    /* direction dependent */
+    /* Key dependent variables. Can be reused if key remains the same */
+    size_t l_index;         /* highest entry of l computed so far */
+    size_t max_l_index;     /* number of entries allocated for l */
+    OCB_BLOCK l_star;       /* L_* = ENCIPHER(K, zeros(128)) */
+    OCB_BLOCK l_dollar;     /* L_$ = double(L_*) */
+    OCB_BLOCK *l;           /* heap table: l[i] = L_i, L_0 = double(L_$) */
+    /* Must be reset for each session */
+    u64 blocks_hashed;      /* full AAD blocks absorbed so far */
+    u64 blocks_processed;   /* full data blocks handled so far */
+    OCB_BLOCK tag;
+    OCB_BLOCK offset_aad;   /* running Offset_i for the AAD */
+    OCB_BLOCK sum;          /* running Sum_i over the AAD */
+    OCB_BLOCK offset;       /* running Offset_i for the data */
+    OCB_BLOCK checksum;     /* running Checksum_i over the plaintext */
+};
+#endif                          /* OPENSSL_NO_OCB */
--- a/crypto/modes/ocb128.c
+++ b/crypto/modes/ocb128.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <string.h>
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+
+#ifndef OPENSSL_NO_OCB
+
+/*
+ * Count the trailing zero bits of n.
+ *
+ * n must be non-zero: the scan below only stops at a set bit, so a zero
+ * argument would loop forever.
+ */
+static u32 ocb_ntz(u64 n)
+{
+    u32 zeros;
+
+    /*
+     * Plain linear scan upwards from the least significant bit. Half of all
+     * inputs stop immediately, a further quarter after one step, an eighth
+     * after two, and so on (87.5% are settled within the low three bits), so
+     * the expected number of iterations is tiny.
+     */
+    for (zeros = 0; (n & 1) == 0; n >>= 1)
+        zeros++;
+
+    return zeros;
+}
+
+/*
+ * Shift the 16-byte block at in left by shift bits (callers pass 0..7) to out
+ */
+static void ocb_block_lshift(const unsigned char *in, size_t shift,
+                             unsigned char *out)
+{
+    unsigned char shift_mask;
+    int i;
+    unsigned char mask[15];     /* mask[k]: bits of in[k + 1] moving to out[k] */
+
+    shift_mask = 0xff;
+    shift_mask <<= (8 - shift); /* now selects the top shift bits of a byte */
+    for (i = 15; i >= 0; i--) {
+        if (i > 0) {
+            mask[i - 1] = in[i] & shift_mask;
+            mask[i - 1] >>= 8 - shift;
+        }
+        out[i] = in[i] << shift;
+
+        if (i != 15) {
+            out[i] ^= mask[i];  /* pull in the carry saved from byte i + 1 */
+        }
+    }
+}
+
+/*
+ * OCB "double": out = in shifted left one bit, with 0x87 folded in on carry
+ */
+static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
+{
+    unsigned char top_bit, reduction;
+
+    /*
+     * Derive the reduction byte arithmetically from the top bit rather than
+     * branching on it, so the running time does not depend on the data.
+     * It is computed before the shift below writes anything to out.
+     */
+    top_bit = (in->c[0] >> 7) & 1;
+    reduction = (unsigned char)(top_bit * 0x87);
+
+    ocb_block_lshift(in->c, 1, out->c);
+
+    out->c[15] ^= reduction;
+}
+
+/*
+ * Write the byte-wise XOR of the len-byte buffers in1 and in2 to out
+ */
+static void ocb_block_xor(const unsigned char *in1,
+                          const unsigned char *in2, size_t len,
+                          unsigned char *out)
+{
+    const unsigned char *end = in1 + len;
+
+    while (in1 != end)
+        *out++ = *in1++ ^ *in2++;
+}
+
+/*
+ * Return L_{idx} from the per-key table, computing missing entries on demand.
+ * Returns NULL, leaving ctx untouched, if the table cannot be enlarged.
+ */
+static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
+{
+    size_t l_index = ctx->l_index;
+
+    if (idx <= l_index) {
+        return ctx->l + idx;
+    }
+
+    /* We don't have it - so calculate it */
+    if (idx >= ctx->max_l_index) {
+        void *tmp_ptr;
+        size_t new_max;
+        /*
+         * Each extra entry nearly doubles the data that can be processed,
+         * so grow linearly, by the smallest multiple of 4 that fits idx.
+         * The new capacity is stored only after realloc succeeds: bumping
+         * max_l_index first would leave it overstating the allocation on
+         * failure, and a retry would then write past the end of ctx->l.
+         */
+        new_max = ctx->max_l_index;
+        new_max += (idx - new_max + 4) & ~3;
+        tmp_ptr = OPENSSL_realloc(ctx->l, new_max * sizeof(OCB_BLOCK));
+        if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */
+            return NULL;
+        ctx->l = tmp_ptr;
+        ctx->max_l_index = new_max;
+    }
+    while (l_index < idx) {
+        ocb_double(ctx->l + l_index, ctx->l + l_index + 1);
+        l_index++;
+    }
+    ctx->l_index = l_index;
+
+    return ctx->l + idx;
+}
+
+/*
+ * Allocate an OCB128_CONTEXT and initialise it; returns NULL on failure
+ */
+OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
+                                  block128_f encrypt, block128_f decrypt,
+                                  ocb128_f stream)
+{
+    OCB128_CONTEXT *fresh = OPENSSL_malloc(sizeof(*fresh));
+
+    if (fresh == NULL)
+        return NULL;
+
+    /* Initialisation can fail too; never hand out a half-built context */
+    if (!CRYPTO_ocb128_init(fresh, keyenc, keydec, encrypt, decrypt, stream)) {
+        OPENSSL_free(fresh);
+        return NULL;
+    }
+
+    return fresh;
+}
+
+/*
+ * Initialise an existing OCB128_CONTEXT; returns 1, or 0 on allocation failure
+ */
+int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
+                       block128_f encrypt, block128_f decrypt,
+                       ocb128_f stream)
+{
+    size_t k;
+
+    memset(ctx, 0, sizeof(*ctx));
+    /* Start with room for L_0..L_4; ocb_lookup_l() grows the table later */
+    ctx->max_l_index = 5;
+    ctx->l = OPENSSL_malloc(ctx->max_l_index * 16);
+    if (ctx->l == NULL)
+        return 0;
+
+    /*
+     * Both key schedules are recorded up front: decryption needs the
+     * encryption one as well, and always taking the pair keeps things
+     * simple even for callers that only ever encrypt.
+     */
+    ctx->keyenc = keyenc;
+    ctx->keydec = keydec;
+    ctx->encrypt = encrypt;
+    ctx->decrypt = decrypt;
+    ctx->stream = stream;
+
+    /* L_* = ENCIPHER(K, zeros(128)) - l_star is still all-zero here */
+    ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
+
+    /* L_$ = double(L_*) */
+    ocb_double(&ctx->l_star, &ctx->l_dollar);
+
+    /* L_0 = double(L_$) */
+    ocb_double(&ctx->l_dollar, ctx->l);
+
+    /* L_{i} = double(L_{i-1}) for i = 1..4 */
+    for (k = 1; k <= 4; k++)
+        ocb_double(ctx->l + k - 1, ctx->l + k);
+    ctx->l_index = 4;   /* enough to process up to 496 bytes */
+
+    return 1;
+}
+
+/*
+ * Duplicate src into dest, giving dest its own L table; 0 on malloc failure
+ */
+int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
+                           void *keyenc, void *keydec)
+{
+    memcpy(dest, src, sizeof(*dest));
+    /* A NULL key argument keeps the schedule pointer copied from src */
+    if (keyenc != NULL)
+        dest->keyenc = keyenc;
+    if (keydec != NULL)
+        dest->keydec = keydec;
+    if (src->l == NULL)
+        return 1;
+    if ((dest->l = OPENSSL_malloc(src->max_l_index * 16)) == NULL)
+        return 0;
+    memcpy(dest->l, src->l, (src->l_index + 1) * 16);
+    return 1;
+}
+
+/*
+ * Set the IV for this operation: len must be 1 - 15 and taglen 1 - 16 bytes.
+ */
+int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
+                        size_t len, size_t taglen)
+{
+    unsigned char ktop[16], tmp[16], mask;
+    unsigned char stretch[24], nonce[16];
+    size_t bottom, shift;
+
+    /*
+     * Spec says IV is 120 bits or fewer - it allows non byte aligned lengths.
+     * We don't support this at this stage
+     */
+    if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) {
+        return -1;              /* note: -1, not 0, reports bad lengths */
+    }
+
+    /* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */
+    nonce[0] = ((taglen * 8) % 128) << 1;
+    memset(nonce + 1, 0, 15);
+    memcpy(nonce + 16 - len, iv, len);
+    nonce[15 - len] |= 1;       /* the single 1 bit that precedes N */
+
+    /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
+    memcpy(tmp, nonce, 16);
+    tmp[15] &= 0xc0;            /* zero the low six bits */
+    ctx->encrypt(tmp, ktop, ctx->keyenc);
+
+    /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
+    memcpy(stretch, ktop, 16);
+    ocb_block_xor(ktop, ktop + 1, 8, stretch + 16);
+
+    /* bottom = str2num(Nonce[123..128]) */
+    bottom = nonce[15] & 0x3f;
+
+    /* Offset_0 = Stretch[1+bottom..128+bottom] */
+    shift = bottom % 8;
+    ocb_block_lshift(stretch + (bottom / 8), shift, ctx->offset.c);
+    mask = 0xff;
+    mask <<= 8 - shift;         /* top shift bits of the following byte */
+    ctx->offset.c[15] |=
+        (*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
+
+    return 1;                   /* only ctx->offset has been written */
+}
+
+/*
+ * Hash len bytes of AAD into ctx->sum. May be called repeatedly, but only the
+ * final call may end in a partial block. Returns 1 on success, 0 on failure
+ */
+int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
+                      size_t len)
+{
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
+    OCB_BLOCK tmp1;
+    OCB_BLOCK tmp2;
+
+    /* Calculate the number of blocks of AAD provided now, and so far */
+    num_blocks = len / 16;
+    all_num_blocks = num_blocks + ctx->blocks_hashed;
+
+    /* Full blocks; aad is the caller's buffer, so it may not be u64-aligned */
+    for (i = ctx->blocks_hashed + 1; i <= all_num_blocks; i++) {
+        OCB_BLOCK *lookup;
+        OCB_BLOCK *aad_block;
+
+        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+        lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+        if (lookup == NULL)
+            return 0;
+        ocb_block16_xor(&ctx->offset_aad, lookup, &ctx->offset_aad);
+
+        /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+        aad_block = (OCB_BLOCK *)(aad + ((i - ctx->blocks_hashed - 1) * 16));
+        ocb_block16_xor_misaligned(&ctx->offset_aad, aad_block, &tmp1);
+        ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
+        ocb_block16_xor(&ctx->sum, &tmp2, &ctx->sum);
+    }
+
+    /*
+     * Check if we have any partial blocks left over. This is only valid in the
+     * last call to this function
+     */
+    last_len = len % 16;
+
+    if (last_len > 0) {
+        /* Offset_* = Offset_m xor L_* */
+        ocb_block16_xor(&ctx->offset_aad, &ctx->l_star, &ctx->offset_aad);
+
+        /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
+        memset(&tmp1, 0, 16);
+        memcpy(&tmp1, aad + (num_blocks * 16), last_len);
+        ((unsigned char *)&tmp1)[last_len] = 0x80;
+        ocb_block16_xor(&ctx->offset_aad, &tmp1, &tmp2);
+
+        /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
+        ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
+        ocb_block16_xor(&ctx->sum, &tmp1, &ctx->sum);
+    }
+
+    ctx->blocks_hashed = all_num_blocks;
+
+    return 1;
+}
+
+/*
+ * Encrypt len bytes from in to out. May be called repeatedly, but only the
+ * final call may end in a partial block. Returns 1 on success, 0 on failure
+ */
+int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
+                          const unsigned char *in, unsigned char *out,
+                          size_t len)
+{
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
+    OCB_BLOCK tmp1;
+    OCB_BLOCK tmp2;
+    OCB_BLOCK pad;
+
+    /*
+     * Calculate the number of blocks of data to be encrypted provided now, and
+     * so far
+     */
+    num_blocks = len / 16;
+    all_num_blocks = num_blocks + ctx->blocks_processed;
+
+    if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+        && ctx->stream != NULL) {
+        size_t max_idx = 0, top = (size_t)all_num_blocks;
+
+        /*
+         * See how many L_{i} entries we need to process data at hand
+         * and pre-compute missing entries in the table [if any]...
+         */
+        while (top >>= 1)       /* max_idx = floor(log2(all_num_blocks)) */
+            max_idx++;
+        if (ocb_lookup_l(ctx, max_idx) == NULL)
+            return 0;
+
+        ctx->stream(in, out, num_blocks, ctx->keyenc,
+                    (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+                    (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+    } else {                    /* generic path: one block per iteration */
+        /* Loop through all full blocks to be encrypted */
+        for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+            OCB_BLOCK *lookup;
+            OCB_BLOCK *inblock;
+            OCB_BLOCK *outblock;
+
+            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+            lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+            if (lookup == NULL)
+                return 0;
+            ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+            /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+            inblock =
+                (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+            /* Checksum_i = Checksum_{i-1} xor P_i */
+            ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
+            ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
+            outblock =
+                (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+        }
+    }
+
+    /*
+     * Check if we have any partial blocks left over. This is only valid in the
+     * last call to this function
+     */
+    last_len = len % 16;
+
+    if (last_len > 0) {
+        /* Offset_* = Offset_m xor L_* */
+        ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
+
+        /* Pad = ENCIPHER(K, Offset_*) */
+        ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
+
+        /* C_* = P_* xor Pad[1..bitlen(P_*)] */
+        ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
+                      out + (num_blocks * 16));
+
+        /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+        memset(&tmp1, 0, 16);
+        memcpy(&tmp1, in + (len / 16) * 16, last_len);
+        ((unsigned char *)(&tmp1))[last_len] = 0x80;
+        ocb_block16_xor(&ctx->checksum, &tmp1, &ctx->checksum);
+    }
+
+    ctx->blocks_processed = all_num_blocks;
+
+    return 1;
+}
+
+/*
+ * Decrypt len bytes of in into out. May be called repeatedly; only the final
+ * call may end in a partial block. Returns 1, or 0 if an L_i lookup fails.
+ */
+int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
+                          const unsigned char *in, unsigned char *out,
+                          size_t len)
+{
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
+    OCB_BLOCK tmp1;
+    OCB_BLOCK tmp2;
+    OCB_BLOCK pad;
+
+    /*
+     * Calculate the number of blocks of data to be decrypted provided now, and
+     * so far
+     */
+    num_blocks = len / 16;
+    all_num_blocks = num_blocks + ctx->blocks_processed;
+
+    if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+        && ctx->stream != NULL) {
+        size_t max_idx = 0, top = (size_t)all_num_blocks;
+
+        /*
+         * Find the highest L_{i} index needed for the data at hand, i.e.
+         * floor(log2(all_num_blocks)), and pre-compute missing table entries
+         */
+        while (top >>= 1)
+            max_idx++;
+        if (ocb_lookup_l(ctx, max_idx) == NULL)
+            return 0;
+
+        ctx->stream(in, out, num_blocks, ctx->keydec,
+                    (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+                    (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+    } else {
+        /* Fallback: walk the full blocks of this call one at a time */
+        for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+            OCB_BLOCK *inblock;
+            OCB_BLOCK *outblock;
+
+            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+            OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+            if (lookup == NULL)
+                return 0;
+            ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+            /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+            inblock =
+                (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+            ctx->decrypt(tmp1.c, tmp2.c, ctx->keydec);
+            outblock =
+                (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+
+            /* Checksum_i = Checksum_{i-1} xor P_i */
+            ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
+        }
+    }
+
+    /*
+     * Check if we have any partial blocks left over. This is only valid in the
+     * last call to this function
+     */
+    last_len = len % 16;
+
+    if (last_len > 0) {
+        /* Offset_* = Offset_m xor L_* */
+        ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
+
+        /* Pad = ENCIPHER(K, Offset_*); forward cipher even when decrypting */
+        ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
+
+        /* P_* = C_* xor Pad[1..bitlen(C_*)] */
+        ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
+                      out + (num_blocks * 16));
+
+        /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+        memset(&tmp1, 0, 16);
+        memcpy(&tmp1, out + (len / 16) * 16, last_len);
+        ((unsigned char *)(&tmp1))[last_len] = 0x80;
+        ocb_block16_xor(&ctx->checksum, &tmp1, &ctx->checksum);
+    }
+
+    ctx->blocks_processed = all_num_blocks;
+
+    return 1;
+}
+
+/*
+ * Compute ctx->tag and compare len (1..16) bytes of it with the supplied tag
+ */
+int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
+                         size_t len)
+{
+    OCB_BLOCK tmp1, tmp2;
+
+    /*
+     * Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A)
+     */
+    ocb_block16_xor(&ctx->checksum, &ctx->offset, &tmp1);
+    ocb_block16_xor(&tmp1, &ctx->l_dollar, &tmp2);
+    ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
+    ocb_block16_xor(&tmp1, &ctx->sum, &ctx->tag);
+
+    if (len > 16 || len < 1) {
+        return -1;
+    }
+
+    /* Compare only if a tag was given; ctx->tag is already set either way */
+    if (tag)
+        return CRYPTO_memcmp(&ctx->tag, tag, len);
+    else
+        return -1;
+}
+
+/*
+ * Copy the first len (1..16) tag bytes into tag; returns 1, or -1 on bad len
+ */
+int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+    if (len > 16 || len < 1) {
+        return -1;
+    }
+
+    /* Fills ctx->tag; the -1 it returns for a NULL tag/len 0 is ignored */
+    CRYPTO_ocb128_finish(ctx, NULL, 0);
+
+    /* Copy the tag into the supplied buffer */
+    memcpy(tag, &ctx->tag, len);
+
+    return 1;
+}
+
+/*
+ * Clear-free ctx->l and cleanse *ctx (not freed); a NULL ctx is ignored
+ */
+void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx)
+{
+    if (ctx) {
+        OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16);
+        OPENSSL_cleanse(ctx, sizeof(*ctx));
+    }
+}
+
+#endif                          /* OPENSSL_NO_OCB */
--- a/crypto/modes/ofb128.c
+++ b/crypto/modes/ofb128.c
@@ -1,64 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 /*
 * The input and output encrypted as though 128bit ofb mode is being used.
 * The extra state information to record how much of the 128bit block we have
@@ -71,8 +23,6 @@ void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
    unsigned int n;
    size_t l = 0;

-    assert(in && out && key && ivec && num);
-
    n = *num;

 #if !defined(OPENSSL_SMALL_FOOTPRINT)
--- a/crypto/modes/wrap128.c
+++ b/crypto/modes/wrap128.c
@@ -1,70 +1,50 @@
-/* crypto/modes/wrap128.c */
 /*
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project.
- */
-/* ====================================================================
- * Copyright (c) 2013 The OpenSSL Project.  All rights reserved.
+ * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

-#include "cryptlib.h"
+/**  Beware!
+ *
+ *  The following wrapping modes were designed for AES, but this
+ *  implementation allows you to use them with any 128-bit block cipher.
+ */
+
+#include "internal/cryptlib.h"
 #include <openssl/modes.h>

+/** RFC 3394 section 2.2.3.1 Default Initial Value */
 static const unsigned char default_iv[] = {
    0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
 };

-/*
- * Input size limit: lower than maximum of standards but far larger than
- * anything that will be used in practice.
+/** RFC 5649 section 3 Alternative Initial Value: the constant first 4 bytes */
+static const unsigned char default_aiv[] = {
+    0xA6, 0x59, 0x59, 0xA6
+};
+
+/** Input size limit (2^31 bytes): lower than maximum of standards but far
+ *  larger than anything that will be used in practice.
 */
 #define CRYPTO128_WRAP_MAX (1UL << 31)

+/** Wrapping according to RFC 3394 section 2.2.1.
+ *
+ *  @param[in]  key    Key value.
+ *  @param[in]  iv     IV value. Length = 8 bytes. NULL = use default_iv.
+ *  @param[in]  in     Plaintext as n 64-bit blocks, n >= 2.
+ *  @param[in]  inlen  Length of in.
+ *  @param[out] out    Ciphertext. Minimal buffer length = (inlen + 8) bytes.
+ *                     Input and output buffers can overlap if block function
+ *                     supports that.
+ *  @param[in]  block  Block processing function.
+ *  @return            0 if inlen does not consist of n 64-bit blocks, n >= 2.
+ *                     or if inlen > CRYPTO128_WRAP_MAX.
+ *                     Output length if wrapping succeeded.
+ */
 size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
                       unsigned char *out,
                       const unsigned char *in, size_t inlen,
@@ -72,11 +52,11 @@ size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
 {
    unsigned char *A, B[16], *R;
    size_t i, j, t;
-    if ((inlen & 0x7) || (inlen < 8) || (inlen > CRYPTO128_WRAP_MAX))
+    if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
        return 0;
    A = B;
    t = 1;
-    memcpy(out + 8, in, inlen);
+    memmove(out + 8, in, inlen);
    if (!iv)
        iv = default_iv;

@@ -100,10 +80,26 @@ size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
    return inlen + 8;
 }

-size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
-                         unsigned char *out,
-                         const unsigned char *in, size_t inlen,
-                         block128_f block)
+/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2.
+ *  The IV check (step 3) is responsibility of the caller.
+ *
+ *  @param[in]  key    Key value.
+ *  @param[out] iv     Unchecked IV value. Minimal buffer length = 8 bytes.
+ *  @param[out] out    Plaintext without IV.
+ *                     Minimal buffer length = (inlen - 8) bytes.
+ *                     Input and output buffers can overlap if block function
+ *                     supports that.
+ *  @param[in]  in     Ciphertext as n 64-bit blocks.
+ *  @param[in]  inlen  Length of in.
+ *  @param[in]  block  Block processing function.
+ *  @return            0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
+ *                     or if inlen is not a multiple of 8.
+ *                     Output length otherwise.
+ */
+static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv,
+                                    unsigned char *out,
+                                    const unsigned char *in, size_t inlen,
+                                    block128_f block)
 {
    unsigned char *A, B[16], *R;
    size_t i, j, t;
@@ -113,7 +109,7 @@ size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
    A = B;
    t = 6 * (inlen >> 3);
    memcpy(A, in, 8);
-    memcpy(out, in + 8, inlen);
+    memmove(out, in + 8, inlen);
    for (j = 0; j < 6; j++) {
        R = out + inlen - 8;
        for (i = 0; i < inlen; i += 8, t--, R -= 8) {
@@ -128,11 +124,206 @@ size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
            memcpy(R, B + 8, 8);
        }
    }
+    memcpy(iv, A, 8);
+    return inlen;
+}
+
+/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check.
+ *  The first block of plaintext has to match the supplied IV, otherwise an
+ *  error is returned.
+ *
+ *  @param[in]  key    Key value.
+ *  @param[in]  iv     IV value to match against. Length = 8 bytes.
+ *                     NULL = use default_iv.
+ *  @param[out] out    Plaintext without IV.
+ *                     Minimal buffer length = (inlen - 8) bytes.
+ *                     Input and output buffers can overlap if block function
+ *                     supports that.
+ *  @param[in]  in     Ciphertext as n 64-bit blocks.
+ *  @param[in]  inlen  Length of in.
+ *  @param[in]  block  Block processing function.
+ *  @return            0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
+ *                     or if inlen is not a multiple of 8
+ *                     or if IV doesn't match expected value (out is cleansed).
+ *                     Output length otherwise.
+ */
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+                         unsigned char *out, const unsigned char *in,
+                         size_t inlen, block128_f block)
+{
+    size_t ret;
+    unsigned char got_iv[8];
+
+    ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block);
+    if (ret == 0)
+        return 0;
+
    if (!iv)
        iv = default_iv;
-    if (memcmp(A, iv, 8)) {
+    if (CRYPTO_memcmp(got_iv, iv, 8)) {
+        OPENSSL_cleanse(out, ret);
+        return 0;
+    }
+    return ret;
+}
+
+/** Wrapping according to RFC 5649 section 4.1.
+ *
+ *  @param[in]  key    Key value.
+ *  @param[in]  icv    (Non-standard) IV, 4 bytes. NULL = use default_aiv.
+ *  @param[out] out    Ciphertext. Minimal buffer length = (inlen + 15) bytes.
+ *                     Input and output buffers can overlap if block function
+ *                     supports that.
+ *  @param[in]  in     Plaintext; zero-padded here to a multiple of 8 bytes.
+ *  @param[in]  inlen  Length of in, in bytes.
+ *  @param[in]  block  Block processing function.
+ *  @return            0 if inlen is out of range [1, CRYPTO128_WRAP_MAX).
+ *                     Output length if wrapping succeeded.
+ */
+size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv,
+                           unsigned char *out,
+                           const unsigned char *in, size_t inlen,
+                           block128_f block)
+{
+    /* n: number of 64-bit blocks in the padded key data
+     *
+     * If length of plain text is not a multiple of 8, pad the plain text octet
+     * string on the right with octets of zeros, where final length is the
+     * smallest multiple of 8 that is greater than length of plain text.
+     * If length of plain text is a multiple of 8, then there is no padding. */
+    const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */
+    const size_t padded_len = blocks_padded * 8;
+    const size_t padding_len = padded_len - inlen;
+    /* RFC 5649 section 3: Alternative Initial Value */
+    unsigned char aiv[8];
+    size_t ret;     /* same type as the return value; can exceed INT_MAX */
+
+    /* Section 1: use 32-bit fixed field for plaintext octet length */
+    if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX)
+        return 0;
+
+    /* Section 3: Alternative Initial Value */
+    if (!icv)
+        memcpy(aiv, default_aiv, 4);
+    else
+        memcpy(aiv, icv, 4);    /* Standard doesn't mention this. */
+
+    aiv[4] = (inlen >> 24) & 0xFF;
+    aiv[5] = (inlen >> 16) & 0xFF;
+    aiv[6] = (inlen >> 8) & 0xFF;
+    aiv[7] = inlen & 0xFF;
+
+    if (padded_len == 8) {
+        /*
+         * Section 4.1 - special case in step 2: If the padded plaintext
+         * contains exactly eight octets, then prepend the AIV and encrypt
+         * the resulting 128-bit block using AES in ECB mode.
+         */
+        memmove(out + 8, in, inlen);
+        memcpy(out, aiv, 8);
+        memset(out + 8 + inlen, 0, padding_len);
+        block(out, out, key);
+        ret = 16;               /* AIV + padded input */
+    } else {
+        memmove(out, in, inlen);
+        memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */
+        ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block);
+    }
+
+    return ret;
+}
+
+/** Unwrapping according to RFC 5649 section 4.2.
+ *
+ *  @param[in]  key    Key value.
+ *  @param[in]  icv    (Non-standard) IV, 4 bytes. NULL = use default_aiv.
+ *  @param[out] out    Plaintext. Minimal buffer length = inlen bytes.
+ *                     Input and output buffers can overlap if block function
+ *                     supports that.
+ *  @param[in]  in     Ciphertext as n 64-bit blocks.
+ *  @param[in]  inlen  Length of in.
+ *  @param[in]  block  Block processing function.
+ *  @return            0 if inlen is out of range [16, CRYPTO128_WRAP_MAX),
+ *                     or if inlen is not a multiple of 8
+ *                     or if AIV, length indicator or zero padding is invalid.
+ *                     Plaintext length if unwrapping succeeded.
+ */
+size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
+                             unsigned char *out,
+                             const unsigned char *in, size_t inlen,
+                             block128_f block)
+{
+    /* n: number of 64-bit blocks in the padded key data */
+    size_t n = inlen / 8 - 1;
+    size_t padded_len;
+    size_t padding_len;
+    size_t ptext_len;
+    /* RFC 5649 section 3: Alternative Initial Value */
+    unsigned char aiv[8];
+    static const unsigned char zeros[8] = { 0x0 };
+    size_t ret;
+
+    /* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */
+    if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX)
+        return 0;
+
+    memmove(out, in, inlen);
+    if (inlen == 16) {
+        /*
+         * Section 4.2 - special case in step 1: When n=1, the ciphertext
+         * contains exactly two 64-bit blocks and they are decrypted as a
+         * single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] |
+         * C[1])
+         */
+        block(out, out, key);
+        memcpy(aiv, out, 8);
+        /* Remove AIV */
+        memmove(out, out + 8, 8);
+        padded_len = 8;
+    } else {
+        padded_len = inlen - 8;
+        ret = crypto_128_unwrap_raw(key, aiv, out, out, inlen, block);
+        if (padded_len != ret) {
+            OPENSSL_cleanse(out, inlen);
+            return 0;
+        }
+    }
+
+    /*
+     * Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a
+     * user-supplied value can be used (even if standard doesn't mention
+     * this).
+     */
+    if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4))
+        || (icv && CRYPTO_memcmp(aiv, icv, 4))) {
        OPENSSL_cleanse(out, inlen);
        return 0;
    }
-    return inlen;
+
+    /*
+     * Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len =
+     * LSB(32,AIV).
+     */
+
+    ptext_len =   ((unsigned int)aiv[4] << 24)
+                | ((unsigned int)aiv[5] << 16)
+                | ((unsigned int)aiv[6] <<  8)
+                |  (unsigned int)aiv[7];
+    if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) {
+        OPENSSL_cleanse(out, inlen);
+        return 0;
+    }
+
+    /*
+     * Check that the rightmost padding_len octets of the output data are
+     * zero.
+     */
+    padding_len = padded_len - ptext_len;
+    if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) {
+        OPENSSL_cleanse(out, inlen);
+        return 0;
+    }
+
+    /* Section 4.2 step 3: Remove padding */
+    return ptext_len;
 }
--- a/crypto/modes/xts128.c
+++ b/crypto/modes/xts128.c
@@ -1,63 +1,16 @@
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 */

 #include <openssl/crypto.h>
 #include "modes_lcl.h"
 #include <string.h>

-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
 int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
                          const unsigned char iv[16],
                          const unsigned char *inp, unsigned char *out,