Blame - ap/lib/libssl/openssl-1.1.1o/crypto/modes/gcm128.c - T106_DC

blob: 8304efff48be038cc297046232cd327910786a1f [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
				3	*
				4	* Licensed under the OpenSSL license (the "License"). You may not use
				5	* this file except in compliance with the License. You can obtain a copy
				6	* in the file LICENSE in the source distribution or at
				7	* https://www.openssl.org/source/license.html
				8	*/
				9
				10	#include <openssl/crypto.h>
				11	#include "modes_local.h"
				12	#include <string.h>
				13
				/*
				 * size_t_aX is the size_t type used for word-at-a-time buffer access.
				 * With GCC and without STRICT_ALIGNMENT it is declared with 1-byte
				 * alignment; otherwise it is a plain size_t.
				 */
				#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
				typedef size_t size_t_aX __attribute((__aligned__(1)));
				#else
				typedef size_t size_t_aX;
				#endif

				#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
				/* redefine, because alignment is ensured */
				# undef  GETU32
				# define GETU32(p)       BSWAP4(*(const u32 *)(p))
				# undef  PUTU32
				# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
				#endif

				/* PACK(s): place the 16-bit constant s in the top 16 bits of a size_t. */
				#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))

				/*
				 * REDUCE1BIT(V): shift the 128-bit value V (fields .hi and .lo) right by
				 * one bit.  If the bit shifted out of V.lo was set, 0xe1 is XORed into the
				 * most significant byte of V.hi.  The two arms compute the same result;
				 * they differ only in the width of the temporary mask T, chosen by
				 * sizeof(size_t).  V is evaluated several times, so it must be a plain
				 * lvalue without side effects.
				 */
				#define REDUCE1BIT(V)   do { \
				        if (sizeof(size_t)==8) { \
				                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
				                V.lo  = (V.hi<<63)|(V.lo>>1); \
				                V.hi  = (V.hi>>1 )^T; \
				        } \
				        else { \
				                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
				                V.lo  = (V.hi<<63)|(V.lo>>1); \
				                V.hi  = (V.hi>>1 )^((u64)T<<32); \
				        } \
				} while(0)
				41
				42	/*-
				43	* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
				44	* never be set to 8. 8 is effectively reserved for testing purposes.
				45	* TABLE_BITS>1 are lookup-table-driven implementations referred to as
				46	* "Shoup's" in GCM specification. In other words OpenSSL does not cover
				47	* whole spectrum of possible table driven implementations. Why? In
				48	* non-"Shoup's" case memory access pattern is segmented in such manner,
				49	* that it's trivial to see that cache timing information can reveal
				50	* fair portion of intermediate hash value. Given that ciphertext is
				51	* always available to attacker, it's possible for him to attempt to
				52	* deduce secret parameter H and if successful, tamper with messages
				53	* [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
				54	* not as trivial, but there is no reason to believe that it's resistant
				55	* to cache-timing attack. And the thing about "8-bit" implementation is
				56	* that it consumes 16 (sixteen) times more memory, 4KB per individual
				57	* key + 1KB shared. Well, on pros side it should be twice as fast as
				58	* "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
				59	* was observed to run ~75% faster, closer to 100% for commercial
				60	* compilers... Yet "4-bit" procedure is preferred, because it's
				61	* believed to provide better security-performance balance and adequate
				62	* all-round performance. "All-round" refers to things like:
				63	*
				64	* - shorter setup time effectively improves overall timing for
				65	* handling short messages;
				66	* - larger table allocation can become unbearable because of VM
				67	* subsystem penalties (for example on Windows large enough free
				68	* results in VM working set trimming, meaning that consequent
				69	* malloc would immediately incur working set expansion);
				70	* - larger table has larger cache footprint, which can affect
				71	* performance of other code paths (not necessarily even from same
				72	* thread in Hyper-Threading world);
				73	*
				74	* Value of 1 is not appropriate for performance reasons.
				75	*/
				76	#if TABLE_BITS==8
				77
				78	static void gcm_init_8bit(u128 Htable[256], u64 H[2])
				79	{
				80	int i, j;
				81	u128 V;
				82
				83	Htable[0].hi = 0;
				84	Htable[0].lo = 0;
				85	V.hi = H[0];
				86	V.lo = H[1];
				87
				88	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
				89	REDUCE1BIT(V);
				90	Htable[i] = V;
				91	}
				92
				93	for (i = 2; i < 256; i <<= 1) {
				94	u128 Hi = Htable + i, H0 = Hi;
				95	for (j = 1; j < i; ++j) {
				96	Hi[j].hi = H0.hi ^ Htable[j].hi;
				97	Hi[j].lo = H0.lo ^ Htable[j].lo;
				98	}
				99	}
				100	}
				101
				102	static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
				103	{
				104	u128 Z = { 0, 0 };
				105	const u8 xi = (const u8 )Xi + 15;
				106	size_t rem, n = *xi;
				107	const union {
				108	long one;
				109	char little;
				110	} is_endian = { 1 };
				111	static const size_t rem_8bit[256] = {
				112	PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
				113	PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
				114	PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
				115	PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
				116	PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
				117	PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
				118	PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
				119	PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
				120	PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
				121	PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
				122	PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
				123	PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
				124	PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
				125	PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
				126	PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
				127	PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
				128	PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
				129	PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
				130	PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
				131	PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
				132	PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
				133	PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
				134	PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
				135	PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
				136	PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
				137	PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
				138	PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
				139	PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
				140	PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
				141	PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
				142	PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
				143	PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
				144	PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
				145	PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
				146	PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
				147	PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
				148	PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
				149	PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
				150	PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
				151	PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
				152	PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
				153	PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
				154	PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
				155	PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
				156	PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
				157	PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
				158	PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
				159	PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
				160	PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
				161	PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
				162	PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
				163	PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
				164	PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
				165	PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
				166	PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
				167	PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
				168	PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
				169	PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
				170	PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
				171	PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
				172	PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
				173	PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
				174	PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
				175	PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
				176	};
				177
				178	while (1) {
				179	Z.hi ^= Htable[n].hi;
				180	Z.lo ^= Htable[n].lo;
				181
				182	if ((u8 *)Xi == xi)
				183	break;
				184
				185	n = *(--xi);
				186
				187	rem = (size_t)Z.lo & 0xff;
				188	Z.lo = (Z.hi << 56) \| (Z.lo >> 8);
				189	Z.hi = (Z.hi >> 8);
				190	if (sizeof(size_t) == 8)
				191	Z.hi ^= rem_8bit[rem];
				192	else
				193	Z.hi ^= (u64)rem_8bit[rem] << 32;
				194	}
				195
				196	if (is_endian.little) {
				197	# ifdef BSWAP8
				198	Xi[0] = BSWAP8(Z.hi);
				199	Xi[1] = BSWAP8(Z.lo);
				200	# else
				201	u8 p = (u8 )Xi;
				202	u32 v;
				203	v = (u32)(Z.hi >> 32);
				204	PUTU32(p, v);
				205	v = (u32)(Z.hi);
				206	PUTU32(p + 4, v);
				207	v = (u32)(Z.lo >> 32);
				208	PUTU32(p + 8, v);
				209	v = (u32)(Z.lo);
				210	PUTU32(p + 12, v);
				211	# endif
				212	} else {
				213	Xi[0] = Z.hi;
				214	Xi[1] = Z.lo;
				215	}
				216	}
				217
				218	# define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
				219
				220	#elif TABLE_BITS==4
				221
				222	static void gcm_init_4bit(u128 Htable[16], u64 H[2])
				223	{
				224	u128 V;
				225	# if defined(OPENSSL_SMALL_FOOTPRINT)
				226	int i;
				227	# endif
				228
				229	Htable[0].hi = 0;
				230	Htable[0].lo = 0;
				231	V.hi = H[0];
				232	V.lo = H[1];
				233
				234	# if defined(OPENSSL_SMALL_FOOTPRINT)
				235	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
				236	REDUCE1BIT(V);
				237	Htable[i] = V;
				238	}
				239
				240	for (i = 2; i < 16; i <<= 1) {
				241	u128 *Hi = Htable + i;
				242	int j;
				243	for (V = *Hi, j = 1; j < i; ++j) {
				244	Hi[j].hi = V.hi ^ Htable[j].hi;
				245	Hi[j].lo = V.lo ^ Htable[j].lo;
				246	}
				247	}
				248	# else
				249	Htable[8] = V;
				250	REDUCE1BIT(V);
				251	Htable[4] = V;
				252	REDUCE1BIT(V);
				253	Htable[2] = V;
				254	REDUCE1BIT(V);
				255	Htable[1] = V;
				256	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
				257	V = Htable[4];
				258	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
				259	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
				260	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
				261	V = Htable[8];
				262	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
				263	Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
				264	Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
				265	Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
				266	Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
				267	Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
				268	Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
				269	# endif
				270	# if defined(GHASH_ASM) && (defined(__arm__) \|\| defined(__arm))
				271	/*
				272	* ARM assembler expects specific dword order in Htable.
				273	*/
				274	{
				275	int j;
				276	const union {
				277	long one;
				278	char little;
				279	} is_endian = { 1 };
				280
				281	if (is_endian.little)
				282	for (j = 0; j < 16; ++j) {
				283	V = Htable[j];
				284	Htable[j].hi = V.lo;
				285	Htable[j].lo = V.hi;
				286	} else
				287	for (j = 0; j < 16; ++j) {
				288	V = Htable[j];
				289	Htable[j].hi = V.lo << 32 \| V.lo >> 32;
				290	Htable[j].lo = V.hi << 32 \| V.hi >> 32;
				291	}
				292	}
				293	# endif
				294	}
				295
				296	# ifndef GHASH_ASM
				297	static const size_t rem_4bit[16] = {
				298	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
				299	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
				300	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
				301	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
				302	};
				303
				304	static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
				305	{
				306	u128 Z;
				307	int cnt = 15;
				308	size_t rem, nlo, nhi;
				309	const union {
				310	long one;
				311	char little;
				312	} is_endian = { 1 };
				313
				314	nlo = ((const u8 *)Xi)[15];
				315	nhi = nlo >> 4;
				316	nlo &= 0xf;
				317
				318	Z.hi = Htable[nlo].hi;
				319	Z.lo = Htable[nlo].lo;
				320
				321	while (1) {
				322	rem = (size_t)Z.lo & 0xf;
				323	Z.lo = (Z.hi << 60) \| (Z.lo >> 4);
				324	Z.hi = (Z.hi >> 4);
				325	if (sizeof(size_t) == 8)
				326	Z.hi ^= rem_4bit[rem];
				327	else
				328	Z.hi ^= (u64)rem_4bit[rem] << 32;
				329
				330	Z.hi ^= Htable[nhi].hi;
				331	Z.lo ^= Htable[nhi].lo;
				332
				333	if (--cnt < 0)
				334	break;
				335
				336	nlo = ((const u8 *)Xi)[cnt];
				337	nhi = nlo >> 4;
				338	nlo &= 0xf;
				339
				340	rem = (size_t)Z.lo & 0xf;
				341	Z.lo = (Z.hi << 60) \| (Z.lo >> 4);
				342	Z.hi = (Z.hi >> 4);
				343	if (sizeof(size_t) == 8)
				344	Z.hi ^= rem_4bit[rem];
				345	else
				346	Z.hi ^= (u64)rem_4bit[rem] << 32;
				347
				348	Z.hi ^= Htable[nlo].hi;
				349	Z.lo ^= Htable[nlo].lo;
				350	}
				351
				352	if (is_endian.little) {
				353	# ifdef BSWAP8
				354	Xi[0] = BSWAP8(Z.hi);
				355	Xi[1] = BSWAP8(Z.lo);
				356	# else
				357	u8 p = (u8 )Xi;
				358	u32 v;
				359	v = (u32)(Z.hi >> 32);
				360	PUTU32(p, v);
				361	v = (u32)(Z.hi);
				362	PUTU32(p + 4, v);
				363	v = (u32)(Z.lo >> 32);
				364	PUTU32(p + 8, v);
				365	v = (u32)(Z.lo);
				366	PUTU32(p + 12, v);
				367	# endif
				368	} else {
				369	Xi[0] = Z.hi;
				370	Xi[1] = Z.lo;
				371	}
				372	}
				373
				374	# if !defined(OPENSSL_SMALL_FOOTPRINT)
				375	/*
				376	* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en\|de]crypt for
				377	* details... Compiler-generated code doesn't seem to give any
				378	* performance improvement, at least not on x86[_64]. It's here
				379	* mostly as reference and a placeholder for possible future
				380	* non-trivial optimization[s]...
				381	*/
				382	static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
				383	const u8 *inp, size_t len)
				384	{
				385	u128 Z;
				386	int cnt;
				387	size_t rem, nlo, nhi;
				388	const union {
				389	long one;
				390	char little;
				391	} is_endian = { 1 };
				392
				393	# if 1
				394	do {
				395	cnt = 15;
				396	nlo = ((const u8 *)Xi)[15];
				397	nlo ^= inp[15];
				398	nhi = nlo >> 4;
				399	nlo &= 0xf;
				400
				401	Z.hi = Htable[nlo].hi;
				402	Z.lo = Htable[nlo].lo;
				403
				404	while (1) {
				405	rem = (size_t)Z.lo & 0xf;
				406	Z.lo = (Z.hi << 60) \| (Z.lo >> 4);
				407	Z.hi = (Z.hi >> 4);
				408	if (sizeof(size_t) == 8)
				409	Z.hi ^= rem_4bit[rem];
				410	else
				411	Z.hi ^= (u64)rem_4bit[rem] << 32;
				412
				413	Z.hi ^= Htable[nhi].hi;
				414	Z.lo ^= Htable[nhi].lo;
				415
				416	if (--cnt < 0)
				417	break;
				418
				419	nlo = ((const u8 *)Xi)[cnt];
				420	nlo ^= inp[cnt];
				421	nhi = nlo >> 4;
				422	nlo &= 0xf;
				423
				424	rem = (size_t)Z.lo & 0xf;
				425	Z.lo = (Z.hi << 60) \| (Z.lo >> 4);
				426	Z.hi = (Z.hi >> 4);
				427	if (sizeof(size_t) == 8)
				428	Z.hi ^= rem_4bit[rem];
				429	else
				430	Z.hi ^= (u64)rem_4bit[rem] << 32;
				431
				432	Z.hi ^= Htable[nlo].hi;
				433	Z.lo ^= Htable[nlo].lo;
				434	}
				435	# else
				436	/*
				437	* Extra 256+16 bytes per-key plus 512 bytes shared tables
				438	* [should] give ~50% improvement... One could have PACK()-ed
				439	* the rem_8bit even here, but the priority is to minimize
				440	* cache footprint...
				441	*/
				442	u128 Hshr4[16]; /* Htable shifted right by 4 bits */
				443	u8 Hshl4[16]; /* Htable shifted left by 4 bits */
				444	static const unsigned short rem_8bit[256] = {
				445	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
				446	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
				447	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
				448	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
				449	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
				450	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
				451	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
				452	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
				453	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
				454	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
				455	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
				456	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
				457	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
				458	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
				459	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
				460	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
				461	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
				462	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
				463	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
				464	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
				465	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
				466	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
				467	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
				468	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
				469	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
				470	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
				471	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
				472	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
				473	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
				474	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
				475	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
				476	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
				477	};
				478	/*
				479	* This pre-processing phase slows down procedure by approximately
				480	* same time as it makes each loop spin faster. In other words
				481	* single block performance is approximately same as straightforward
				482	* "4-bit" implementation, and then it goes only faster...
				483	*/
				484	for (cnt = 0; cnt < 16; ++cnt) {
				485	Z.hi = Htable[cnt].hi;
				486	Z.lo = Htable[cnt].lo;
				487	Hshr4[cnt].lo = (Z.hi << 60) \| (Z.lo >> 4);
				488	Hshr4[cnt].hi = (Z.hi >> 4);
				489	Hshl4[cnt] = (u8)(Z.lo << 4);
				490	}
				491
				492	do {
				493	for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
				494	nlo = ((const u8 *)Xi)[cnt];
				495	nlo ^= inp[cnt];
				496	nhi = nlo >> 4;
				497	nlo &= 0xf;
				498
				499	Z.hi ^= Htable[nlo].hi;
				500	Z.lo ^= Htable[nlo].lo;
				501
				502	rem = (size_t)Z.lo & 0xff;
				503
				504	Z.lo = (Z.hi << 56) \| (Z.lo >> 8);
				505	Z.hi = (Z.hi >> 8);
				506
				507	Z.hi ^= Hshr4[nhi].hi;
				508	Z.lo ^= Hshr4[nhi].lo;
				509	Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
				510	}
				511
				512	nlo = ((const u8 *)Xi)[0];
				513	nlo ^= inp[0];
				514	nhi = nlo >> 4;
				515	nlo &= 0xf;
				516
				517	Z.hi ^= Htable[nlo].hi;
				518	Z.lo ^= Htable[nlo].lo;
				519
				520	rem = (size_t)Z.lo & 0xf;
				521
				522	Z.lo = (Z.hi << 60) \| (Z.lo >> 4);
				523	Z.hi = (Z.hi >> 4);
				524
				525	Z.hi ^= Htable[nhi].hi;
				526	Z.lo ^= Htable[nhi].lo;
				527	Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
				528	# endif
				529
				530	if (is_endian.little) {
				531	# ifdef BSWAP8
				532	Xi[0] = BSWAP8(Z.hi);
				533	Xi[1] = BSWAP8(Z.lo);
				534	# else
				535	u8 p = (u8 )Xi;
				536	u32 v;
				537	v = (u32)(Z.hi >> 32);
				538	PUTU32(p, v);
				539	v = (u32)(Z.hi);
				540	PUTU32(p + 4, v);
				541	v = (u32)(Z.lo >> 32);
				542	PUTU32(p + 8, v);
				543	v = (u32)(Z.lo);
				544	PUTU32(p + 12, v);
				545	# endif
				546	} else {
				547	Xi[0] = Z.hi;
				548	Xi[1] = Z.lo;
				549	}
				550	} while (inp += 16, len -= 16);
				551	}
				552	# endif
				553	# else
				554	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
				555	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				556	size_t len);
				557	# endif
				558
				559	# define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
				560	# if defined(GHASH_ASM) \|\| !defined(OPENSSL_SMALL_FOOTPRINT)
				561	# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
				562	/*
				563	* GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
				564	* effect. In other words idea is to hash data while it's still in L1 cache
				565	* after encryption pass...
				566	*/
				567	# define GHASH_CHUNK (3*1024)
				568	# endif
				569
				570	#else /* TABLE_BITS */
				571
				572	static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
				573	{
				574	u128 V, Z = { 0, 0 };
				575	long X;
				576	int i, j;
				577	const long xi = (const long )Xi;
				578	const union {
				579	long one;
				580	char little;
				581	} is_endian = { 1 };
				582
				583	V.hi = H[0]; /* H is in host byte order, no byte swapping */
				584	V.lo = H[1];
				585
				586	for (j = 0; j < 16 / sizeof(long); ++j) {
				587	if (is_endian.little) {
				588	if (sizeof(long) == 8) {
				589	# ifdef BSWAP8
				590	X = (long)(BSWAP8(xi[j]));
				591	# else
				592	const u8 p = (const u8 )(xi + j);
				593	X = (long)((u64)GETU32(p) << 32 \| GETU32(p + 4));
				594	# endif
				595	} else {
				596	const u8 p = (const u8 )(xi + j);
				597	X = (long)GETU32(p);
				598	}
				599	} else
				600	X = xi[j];
				601
				602	for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
				603	u64 M = (u64)(X >> (8 * sizeof(long) - 1));
				604	Z.hi ^= V.hi & M;
				605	Z.lo ^= V.lo & M;
				606
				607	REDUCE1BIT(V);
				608	}
				609	}
				610
				611	if (is_endian.little) {
				612	# ifdef BSWAP8
				613	Xi[0] = BSWAP8(Z.hi);
				614	Xi[1] = BSWAP8(Z.lo);
				615	# else
				616	u8 p = (u8 )Xi;
				617	u32 v;
				618	v = (u32)(Z.hi >> 32);
				619	PUTU32(p, v);
				620	v = (u32)(Z.hi);
				621	PUTU32(p + 4, v);
				622	v = (u32)(Z.lo >> 32);
				623	PUTU32(p + 8, v);
				624	v = (u32)(Z.lo);
				625	PUTU32(p + 12, v);
				626	# endif
				627	} else {
				628	Xi[0] = Z.hi;
				629	Xi[1] = Z.lo;
				630	}
				631	}
				632
				633	# define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
				634
				635	#endif
				636
				/*
				 * Per-architecture GHASH back ends for the 4-bit table build.
				 *
				 * Each arm below declares the init/gmult/ghash entry points available on
				 * that architecture and defines GCM_FUNCREF_4BIT.  CRYPTO_gcm128_init()
				 * picks one of them at run time and stores it in ctx->gmult / ctx->ghash.
				 */
				#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
				# if    !defined(I386_ONLY) && \
				        (defined(__i386)        || defined(__i386__)    || \
				         defined(__x86_64)      || defined(__x86_64__)  || \
				         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
				#  define GHASH_ASM_X86_OR_64
				#  define GCM_FUNCREF_4BIT
				extern unsigned int OPENSSL_ia32cap_P[];

				void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                     size_t len);

				/* On 32-bit x86 the AVX names are aliases for the CLMUL routines. */
				#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
				#   define gcm_init_avx   gcm_init_clmul
				#   define gcm_gmult_avx  gcm_gmult_clmul
				#   define gcm_ghash_avx  gcm_ghash_clmul
				#  else
				void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                   size_t len);
				#  endif

				#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
				#   define GHASH_ASM_X86
				void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                        size_t len);

				void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                        size_t len);
				#  endif
				# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
				#  include "arm_arch.h"
				#  if __ARM_MAX_ARCH__>=7
				#   define GHASH_ASM_ARM
				#   define GCM_FUNCREF_4BIT
				#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
				#   if defined(__arm__) || defined(__arm)
				#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
				#   endif
				void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                    size_t len);
				void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                  size_t len);
				#  endif
				# elif defined(__sparc__) || defined(__sparc)
				#  include "sparc_arch.h"
				#  define GHASH_ASM_SPARC
				#  define GCM_FUNCREF_4BIT
				extern unsigned int OPENSSL_sparcv9cap_P[];
				void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                    size_t len);
				# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
				#  include "ppc_arch.h"
				#  define GHASH_ASM_PPC
				#  define GCM_FUNCREF_4BIT
				void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
				void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
				void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
				                  size_t len);
				# endif
				#endif

				/*
				 * With GCM_FUNCREF_4BIT, GCM_MUL and GHASH call through the local function
				 * pointers gcm_gmult_p / gcm_ghash_p, which each user of the macros must
				 * declare and load from ctx->gmult / ctx->ghash.
				 */
				#ifdef GCM_FUNCREF_4BIT
				# undef  GCM_MUL
				# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
				# ifdef GHASH
				#  undef  GHASH
				#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
				# endif
				#endif
				718
				719	void CRYPTO_gcm128_init(GCM128_CONTEXT ctx, void key, block128_f block)
				720	{
				721	const union {
				722	long one;
				723	char little;
				724	} is_endian = { 1 };
				725
				726	memset(ctx, 0, sizeof(*ctx));
				727	ctx->block = block;
				728	ctx->key = key;
				729
				730	(*block) (ctx->H.c, ctx->H.c, key);
				731
				732	if (is_endian.little) {
				733	/* H is stored in host byte order */
				734	#ifdef BSWAP8
				735	ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
				736	ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
				737	#else
				738	u8 *p = ctx->H.c;
				739	u64 hi, lo;
				740	hi = (u64)GETU32(p) << 32 \| GETU32(p + 4);
				741	lo = (u64)GETU32(p + 8) << 32 \| GETU32(p + 12);
				742	ctx->H.u[0] = hi;
				743	ctx->H.u[1] = lo;
				744	#endif
				745	}
				746	#if TABLE_BITS==8
				747	gcm_init_8bit(ctx->Htable, ctx->H.u);
				748	#elif TABLE_BITS==4
				749	# if defined(GHASH)
				750	# define CTX__GHASH(f) (ctx->ghash = (f))
				751	# else
				752	# define CTX__GHASH(f) (ctx->ghash = NULL)
				753	# endif
				754	# if defined(GHASH_ASM_X86_OR_64)
				755	# if !defined(GHASH_ASM_X86) \|\| defined(OPENSSL_IA32_SSE2)
				756	if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
				757	if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
				758	gcm_init_avx(ctx->Htable, ctx->H.u);
				759	ctx->gmult = gcm_gmult_avx;
				760	CTX__GHASH(gcm_ghash_avx);
				761	} else {
				762	gcm_init_clmul(ctx->Htable, ctx->H.u);
				763	ctx->gmult = gcm_gmult_clmul;
				764	CTX__GHASH(gcm_ghash_clmul);
				765	}
				766	return;
				767	}
				768	# endif
				769	gcm_init_4bit(ctx->Htable, ctx->H.u);
				770	# if defined(GHASH_ASM_X86) /* x86 only */
				771	# if defined(OPENSSL_IA32_SSE2)
				772	if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
				773	# else
				774	if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
				775	# endif
				776	ctx->gmult = gcm_gmult_4bit_mmx;
				777	CTX__GHASH(gcm_ghash_4bit_mmx);
				778	} else {
				779	ctx->gmult = gcm_gmult_4bit_x86;
				780	CTX__GHASH(gcm_ghash_4bit_x86);
				781	}
				782	# else
				783	ctx->gmult = gcm_gmult_4bit;
				784	CTX__GHASH(gcm_ghash_4bit);
				785	# endif
				786	# elif defined(GHASH_ASM_ARM)
				787	# ifdef PMULL_CAPABLE
				788	if (PMULL_CAPABLE) {
				789	gcm_init_v8(ctx->Htable, ctx->H.u);
				790	ctx->gmult = gcm_gmult_v8;
				791	CTX__GHASH(gcm_ghash_v8);
				792	} else
				793	# endif
				794	# ifdef NEON_CAPABLE
				795	if (NEON_CAPABLE) {
				796	gcm_init_neon(ctx->Htable, ctx->H.u);
				797	ctx->gmult = gcm_gmult_neon;
				798	CTX__GHASH(gcm_ghash_neon);
				799	} else
				800	# endif
				801	{
				802	gcm_init_4bit(ctx->Htable, ctx->H.u);
				803	ctx->gmult = gcm_gmult_4bit;
				804	CTX__GHASH(gcm_ghash_4bit);
				805	}
				806	# elif defined(GHASH_ASM_SPARC)
				807	if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
				808	gcm_init_vis3(ctx->Htable, ctx->H.u);
				809	ctx->gmult = gcm_gmult_vis3;
				810	CTX__GHASH(gcm_ghash_vis3);
				811	} else {
				812	gcm_init_4bit(ctx->Htable, ctx->H.u);
				813	ctx->gmult = gcm_gmult_4bit;
				814	CTX__GHASH(gcm_ghash_4bit);
				815	}
				816	# elif defined(GHASH_ASM_PPC)
				817	if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
				818	gcm_init_p8(ctx->Htable, ctx->H.u);
				819	ctx->gmult = gcm_gmult_p8;
				820	CTX__GHASH(gcm_ghash_p8);
				821	} else {
				822	gcm_init_4bit(ctx->Htable, ctx->H.u);
				823	ctx->gmult = gcm_gmult_4bit;
				824	CTX__GHASH(gcm_ghash_4bit);
				825	}
				826	# else
				827	gcm_init_4bit(ctx->Htable, ctx->H.u);
				828	# endif
				829	# undef CTX__GHASH
				830	#endif
				831	}
				832
				833	void CRYPTO_gcm128_setiv(GCM128_CONTEXT ctx, const unsigned char iv,
				834	size_t len)
				835	{
				836	const union {
				837	long one;
				838	char little;
				839	} is_endian = { 1 };
				840	unsigned int ctr;
				841	#ifdef GCM_FUNCREF_4BIT
				842	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				843	#endif
				844
				845	ctx->len.u[0] = 0; /* AAD length */
				846	ctx->len.u[1] = 0; /* message length */
				847	ctx->ares = 0;
				848	ctx->mres = 0;
				849
				850	if (len == 12) {
				851	memcpy(ctx->Yi.c, iv, 12);
				852	ctx->Yi.c[12] = 0;
				853	ctx->Yi.c[13] = 0;
				854	ctx->Yi.c[14] = 0;
				855	ctx->Yi.c[15] = 1;
				856	ctr = 1;
				857	} else {
				858	size_t i;
				859	u64 len0 = len;
				860
				861	/* Borrow ctx->Xi to calculate initial Yi */
				862	ctx->Xi.u[0] = 0;
				863	ctx->Xi.u[1] = 0;
				864
				865	while (len >= 16) {
				866	for (i = 0; i < 16; ++i)
				867	ctx->Xi.c[i] ^= iv[i];
				868	GCM_MUL(ctx);
				869	iv += 16;
				870	len -= 16;
				871	}
				872	if (len) {
				873	for (i = 0; i < len; ++i)
				874	ctx->Xi.c[i] ^= iv[i];
				875	GCM_MUL(ctx);
				876	}
				877	len0 <<= 3;
				878	if (is_endian.little) {
				879	#ifdef BSWAP8
				880	ctx->Xi.u[1] ^= BSWAP8(len0);
				881	#else
				882	ctx->Xi.c[8] ^= (u8)(len0 >> 56);
				883	ctx->Xi.c[9] ^= (u8)(len0 >> 48);
				884	ctx->Xi.c[10] ^= (u8)(len0 >> 40);
				885	ctx->Xi.c[11] ^= (u8)(len0 >> 32);
				886	ctx->Xi.c[12] ^= (u8)(len0 >> 24);
				887	ctx->Xi.c[13] ^= (u8)(len0 >> 16);
				888	ctx->Xi.c[14] ^= (u8)(len0 >> 8);
				889	ctx->Xi.c[15] ^= (u8)(len0);
				890	#endif
				891	} else {
				892	ctx->Xi.u[1] ^= len0;
				893	}
				894
				895	GCM_MUL(ctx);
				896
				897	if (is_endian.little)
				898	#ifdef BSWAP4
				899	ctr = BSWAP4(ctx->Xi.d[3]);
				900	#else
				901	ctr = GETU32(ctx->Xi.c + 12);
				902	#endif
				903	else
				904	ctr = ctx->Xi.d[3];
				905
				906	/* Copy borrowed Xi to Yi */
				907	ctx->Yi.u[0] = ctx->Xi.u[0];
				908	ctx->Yi.u[1] = ctx->Xi.u[1];
				909	}
				910
				911	ctx->Xi.u[0] = 0;
				912	ctx->Xi.u[1] = 0;
				913
				914	(*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
				915	++ctr;
				916	if (is_endian.little)
				917	#ifdef BSWAP4
				918	ctx->Yi.d[3] = BSWAP4(ctr);
				919	#else
				920	PUTU32(ctx->Yi.c + 12, ctr);
				921	#endif
				922	else
				923	ctx->Yi.d[3] = ctr;
				924	}
				925
				926	int CRYPTO_gcm128_aad(GCM128_CONTEXT ctx, const unsigned char aad,
				927	size_t len)
				928	{
				929	size_t i;
				930	unsigned int n;
				931	u64 alen = ctx->len.u[0];
				932	#ifdef GCM_FUNCREF_4BIT
				933	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				934	# ifdef GHASH
				935	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				936	const u8 *inp, size_t len) = ctx->ghash;
				937	# endif
				938	#endif
				939
				940	if (ctx->len.u[1])
				941	return -2;
				942
				943	alen += len;
				944	if (alen > (U64(1) << 61) \|\| (sizeof(len) == 8 && alen < len))
				945	return -1;
				946	ctx->len.u[0] = alen;
				947
				948	n = ctx->ares;
				949	if (n) {
				950	while (n && len) {
				951	ctx->Xi.c[n] ^= *(aad++);
				952	--len;
				953	n = (n + 1) % 16;
				954	}
				955	if (n == 0)
				956	GCM_MUL(ctx);
				957	else {
				958	ctx->ares = n;
				959	return 0;
				960	}
				961	}
				962	#ifdef GHASH
				963	if ((i = (len & (size_t)-16))) {
				964	GHASH(ctx, aad, i);
				965	aad += i;
				966	len -= i;
				967	}
				968	#else
				969	while (len >= 16) {
				970	for (i = 0; i < 16; ++i)
				971	ctx->Xi.c[i] ^= aad[i];
				972	GCM_MUL(ctx);
				973	aad += 16;
				974	len -= 16;
				975	}
				976	#endif
				977	if (len) {
				978	n = (unsigned int)len;
				979	for (i = 0; i < len; ++i)
				980	ctx->Xi.c[i] ^= aad[i];
				981	}
				982
				983	ctx->ares = n;
				984	return 0;
				985	}
				986
				987	int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
				988	const unsigned char in, unsigned char out,
				989	size_t len)
				990	{
				991	const union {
				992	long one;
				993	char little;
				994	} is_endian = { 1 };
				995	unsigned int n, ctr, mres;
				996	size_t i;
				997	u64 mlen = ctx->len.u[1];
				998	block128_f block = ctx->block;
				999	void *key = ctx->key;
				1000	#ifdef GCM_FUNCREF_4BIT
				1001	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				1002	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1003	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				1004	const u8 *inp, size_t len) = ctx->ghash;
				1005	# endif
				1006	#endif
				1007
				1008	mlen += len;
				1009	if (mlen > ((U64(1) << 36) - 32) \|\| (sizeof(len) == 8 && mlen < len))
				1010	return -1;
				1011	ctx->len.u[1] = mlen;
				1012
				1013	mres = ctx->mres;
				1014
				1015	if (ctx->ares) {
				1016	/* First call to encrypt finalizes GHASH(AAD) */
				1017	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1018	if (len == 0) {
				1019	GCM_MUL(ctx);
				1020	ctx->ares = 0;
				1021	return 0;
				1022	}
				1023	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
				1024	ctx->Xi.u[0] = 0;
				1025	ctx->Xi.u[1] = 0;
				1026	mres = sizeof(ctx->Xi);
				1027	#else
				1028	GCM_MUL(ctx);
				1029	#endif
				1030	ctx->ares = 0;
				1031	}
				1032
				1033	if (is_endian.little)
				1034	#ifdef BSWAP4
				1035	ctr = BSWAP4(ctx->Yi.d[3]);
				1036	#else
				1037	ctr = GETU32(ctx->Yi.c + 12);
				1038	#endif
				1039	else
				1040	ctr = ctx->Yi.d[3];
				1041
				1042	n = mres % 16;
				1043	#if !defined(OPENSSL_SMALL_FOOTPRINT)
				1044	if (16 % sizeof(size_t) == 0) { /* always true actually */
				1045	do {
				1046	if (n) {
				1047	# if defined(GHASH)
				1048	while (n && len) {
				1049	ctx->Xn[mres++] = (out++) = (in++) ^ ctx->EKi.c[n];
				1050	--len;
				1051	n = (n + 1) % 16;
				1052	}
				1053	if (n == 0) {
				1054	GHASH(ctx, ctx->Xn, mres);
				1055	mres = 0;
				1056	} else {
				1057	ctx->mres = mres;
				1058	return 0;
				1059	}
				1060	# else
				1061	while (n && len) {
				1062	ctx->Xi.c[n] ^= (out++) = (in++) ^ ctx->EKi.c[n];
				1063	--len;
				1064	n = (n + 1) % 16;
				1065	}
				1066	if (n == 0) {
				1067	GCM_MUL(ctx);
				1068	mres = 0;
				1069	} else {
				1070	ctx->mres = n;
				1071	return 0;
				1072	}
				1073	# endif
				1074	}
				1075	# if defined(STRICT_ALIGNMENT)
				1076	if (((size_t)in \| (size_t)out) % sizeof(size_t) != 0)
				1077	break;
				1078	# endif
				1079	# if defined(GHASH)
				1080	if (len >= 16 && mres) {
				1081	GHASH(ctx, ctx->Xn, mres);
				1082	mres = 0;
				1083	}
				1084	# if defined(GHASH_CHUNK)
				1085	while (len >= GHASH_CHUNK) {
				1086	size_t j = GHASH_CHUNK;
				1087
				1088	while (j) {
				1089	size_t_aX out_t = (size_t_aX )out;
				1090	const size_t_aX in_t = (const size_t_aX )in;
				1091
				1092	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1093	++ctr;
				1094	if (is_endian.little)
				1095	# ifdef BSWAP4
				1096	ctx->Yi.d[3] = BSWAP4(ctr);
				1097	# else
				1098	PUTU32(ctx->Yi.c + 12, ctr);
				1099	# endif
				1100	else
				1101	ctx->Yi.d[3] = ctr;
				1102	for (i = 0; i < 16 / sizeof(size_t); ++i)
				1103	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				1104	out += 16;
				1105	in += 16;
				1106	j -= 16;
				1107	}
				1108	GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
				1109	len -= GHASH_CHUNK;
				1110	}
				1111	# endif
				1112	if ((i = (len & (size_t)-16))) {
				1113	size_t j = i;
				1114
				1115	while (len >= 16) {
				1116	size_t_aX out_t = (size_t_aX )out;
				1117	const size_t_aX in_t = (const size_t_aX )in;
				1118
				1119	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1120	++ctr;
				1121	if (is_endian.little)
				1122	# ifdef BSWAP4
				1123	ctx->Yi.d[3] = BSWAP4(ctr);
				1124	# else
				1125	PUTU32(ctx->Yi.c + 12, ctr);
				1126	# endif
				1127	else
				1128	ctx->Yi.d[3] = ctr;
				1129	for (i = 0; i < 16 / sizeof(size_t); ++i)
				1130	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				1131	out += 16;
				1132	in += 16;
				1133	len -= 16;
				1134	}
				1135	GHASH(ctx, out - j, j);
				1136	}
				1137	# else
				1138	while (len >= 16) {
				1139	size_t out_t = (size_t )out;
				1140	const size_t in_t = (const size_t )in;
				1141
				1142	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1143	++ctr;
				1144	if (is_endian.little)
				1145	# ifdef BSWAP4
				1146	ctx->Yi.d[3] = BSWAP4(ctr);
				1147	# else
				1148	PUTU32(ctx->Yi.c + 12, ctr);
				1149	# endif
				1150	else
				1151	ctx->Yi.d[3] = ctr;
				1152	for (i = 0; i < 16 / sizeof(size_t); ++i)
				1153	ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				1154	GCM_MUL(ctx);
				1155	out += 16;
				1156	in += 16;
				1157	len -= 16;
				1158	}
				1159	# endif
				1160	if (len) {
				1161	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1162	++ctr;
				1163	if (is_endian.little)
				1164	# ifdef BSWAP4
				1165	ctx->Yi.d[3] = BSWAP4(ctr);
				1166	# else
				1167	PUTU32(ctx->Yi.c + 12, ctr);
				1168	# endif
				1169	else
				1170	ctx->Yi.d[3] = ctr;
				1171	# if defined(GHASH)
				1172	while (len--) {
				1173	ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
				1174	++n;
				1175	}
				1176	# else
				1177	while (len--) {
				1178	ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
				1179	++n;
				1180	}
				1181	mres = n;
				1182	# endif
				1183	}
				1184
				1185	ctx->mres = mres;
				1186	return 0;
				1187	} while (0);
				1188	}
				1189	#endif
				1190	for (i = 0; i < len; ++i) {
				1191	if (n == 0) {
				1192	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1193	++ctr;
				1194	if (is_endian.little)
				1195	#ifdef BSWAP4
				1196	ctx->Yi.d[3] = BSWAP4(ctr);
				1197	#else
				1198	PUTU32(ctx->Yi.c + 12, ctr);
				1199	#endif
				1200	else
				1201	ctx->Yi.d[3] = ctr;
				1202	}
				1203	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1204	ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
				1205	n = (n + 1) % 16;
				1206	if (mres == sizeof(ctx->Xn)) {
				1207	GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
				1208	mres = 0;
				1209	}
				1210	#else
				1211	ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
				1212	mres = n = (n + 1) % 16;
				1213	if (n == 0)
				1214	GCM_MUL(ctx);
				1215	#endif
				1216	}
				1217
				1218	ctx->mres = mres;
				1219	return 0;
				1220	}
				1221
				1222	int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
				1223	const unsigned char in, unsigned char out,
				1224	size_t len)
				1225	{
				1226	const union {
				1227	long one;
				1228	char little;
				1229	} is_endian = { 1 };
				1230	unsigned int n, ctr, mres;
				1231	size_t i;
				1232	u64 mlen = ctx->len.u[1];
				1233	block128_f block = ctx->block;
				1234	void *key = ctx->key;
				1235	#ifdef GCM_FUNCREF_4BIT
				1236	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				1237	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1238	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				1239	const u8 *inp, size_t len) = ctx->ghash;
				1240	# endif
				1241	#endif
				1242
				1243	mlen += len;
				1244	if (mlen > ((U64(1) << 36) - 32) \|\| (sizeof(len) == 8 && mlen < len))
				1245	return -1;
				1246	ctx->len.u[1] = mlen;
				1247
				1248	mres = ctx->mres;
				1249
				1250	if (ctx->ares) {
				1251	/* First call to decrypt finalizes GHASH(AAD) */
				1252	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1253	if (len == 0) {
				1254	GCM_MUL(ctx);
				1255	ctx->ares = 0;
				1256	return 0;
				1257	}
				1258	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
				1259	ctx->Xi.u[0] = 0;
				1260	ctx->Xi.u[1] = 0;
				1261	mres = sizeof(ctx->Xi);
				1262	#else
				1263	GCM_MUL(ctx);
				1264	#endif
				1265	ctx->ares = 0;
				1266	}
				1267
				1268	if (is_endian.little)
				1269	#ifdef BSWAP4
				1270	ctr = BSWAP4(ctx->Yi.d[3]);
				1271	#else
				1272	ctr = GETU32(ctx->Yi.c + 12);
				1273	#endif
				1274	else
				1275	ctr = ctx->Yi.d[3];
				1276
				1277	n = mres % 16;
				1278	#if !defined(OPENSSL_SMALL_FOOTPRINT)
				1279	if (16 % sizeof(size_t) == 0) { /* always true actually */
				1280	do {
				1281	if (n) {
				1282	# if defined(GHASH)
				1283	while (n && len) {
				1284	(out++) = (ctx->Xn[mres++] = (in++)) ^ ctx->EKi.c[n];
				1285	--len;
				1286	n = (n + 1) % 16;
				1287	}
				1288	if (n == 0) {
				1289	GHASH(ctx, ctx->Xn, mres);
				1290	mres = 0;
				1291	} else {
				1292	ctx->mres = mres;
				1293	return 0;
				1294	}
				1295	# else
				1296	while (n && len) {
				1297	u8 c = *(in++);
				1298	*(out++) = c ^ ctx->EKi.c[n];
				1299	ctx->Xi.c[n] ^= c;
				1300	--len;
				1301	n = (n + 1) % 16;
				1302	}
				1303	if (n == 0) {
				1304	GCM_MUL(ctx);
				1305	mres = 0;
				1306	} else {
				1307	ctx->mres = n;
				1308	return 0;
				1309	}
				1310	# endif
				1311	}
				1312	# if defined(STRICT_ALIGNMENT)
				1313	if (((size_t)in \| (size_t)out) % sizeof(size_t) != 0)
				1314	break;
				1315	# endif
				1316	# if defined(GHASH)
				1317	if (len >= 16 && mres) {
				1318	GHASH(ctx, ctx->Xn, mres);
				1319	mres = 0;
				1320	}
				1321	# if defined(GHASH_CHUNK)
				1322	while (len >= GHASH_CHUNK) {
				1323	size_t j = GHASH_CHUNK;
				1324
				1325	GHASH(ctx, in, GHASH_CHUNK);
				1326	while (j) {
				1327	size_t_aX out_t = (size_t_aX )out;
				1328	const size_t_aX in_t = (const size_t_aX )in;
				1329
				1330	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1331	++ctr;
				1332	if (is_endian.little)
				1333	# ifdef BSWAP4
				1334	ctx->Yi.d[3] = BSWAP4(ctr);
				1335	# else
				1336	PUTU32(ctx->Yi.c + 12, ctr);
				1337	# endif
				1338	else
				1339	ctx->Yi.d[3] = ctr;
				1340	for (i = 0; i < 16 / sizeof(size_t); ++i)
				1341	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				1342	out += 16;
				1343	in += 16;
				1344	j -= 16;
				1345	}
				1346	len -= GHASH_CHUNK;
				1347	}
				1348	# endif
				1349	if ((i = (len & (size_t)-16))) {
				1350	GHASH(ctx, in, i);
				1351	while (len >= 16) {
				1352	size_t_aX out_t = (size_t_aX )out;
				1353	const size_t_aX in_t = (const size_t_aX )in;
				1354
				1355	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1356	++ctr;
				1357	if (is_endian.little)
				1358	# ifdef BSWAP4
				1359	ctx->Yi.d[3] = BSWAP4(ctr);
				1360	# else
				1361	PUTU32(ctx->Yi.c + 12, ctr);
				1362	# endif
				1363	else
				1364	ctx->Yi.d[3] = ctr;
				1365	for (i = 0; i < 16 / sizeof(size_t); ++i)
				1366	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				1367	out += 16;
				1368	in += 16;
				1369	len -= 16;
				1370	}
				1371	}
				1372	# else
				1373	while (len >= 16) {
				1374	size_t out_t = (size_t )out;
				1375	const size_t in_t = (const size_t )in;
				1376
				1377	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1378	++ctr;
				1379	if (is_endian.little)
				1380	# ifdef BSWAP4
				1381	ctx->Yi.d[3] = BSWAP4(ctr);
				1382	# else
				1383	PUTU32(ctx->Yi.c + 12, ctr);
				1384	# endif
				1385	else
				1386	ctx->Yi.d[3] = ctr;
				1387	for (i = 0; i < 16 / sizeof(size_t); ++i) {
				1388	size_t c = in_t[i];
				1389	out_t[i] = c ^ ctx->EKi.t[i];
				1390	ctx->Xi.t[i] ^= c;
				1391	}
				1392	GCM_MUL(ctx);
				1393	out += 16;
				1394	in += 16;
				1395	len -= 16;
				1396	}
				1397	# endif
				1398	if (len) {
				1399	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1400	++ctr;
				1401	if (is_endian.little)
				1402	# ifdef BSWAP4
				1403	ctx->Yi.d[3] = BSWAP4(ctr);
				1404	# else
				1405	PUTU32(ctx->Yi.c + 12, ctr);
				1406	# endif
				1407	else
				1408	ctx->Yi.d[3] = ctr;
				1409	# if defined(GHASH)
				1410	while (len--) {
				1411	out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
				1412	++n;
				1413	}
				1414	# else
				1415	while (len--) {
				1416	u8 c = in[n];
				1417	ctx->Xi.c[n] ^= c;
				1418	out[n] = c ^ ctx->EKi.c[n];
				1419	++n;
				1420	}
				1421	mres = n;
				1422	# endif
				1423	}
				1424
				1425	ctx->mres = mres;
				1426	return 0;
				1427	} while (0);
				1428	}
				1429	#endif
				1430	for (i = 0; i < len; ++i) {
				1431	u8 c;
				1432	if (n == 0) {
				1433	(*block) (ctx->Yi.c, ctx->EKi.c, key);
				1434	++ctr;
				1435	if (is_endian.little)
				1436	#ifdef BSWAP4
				1437	ctx->Yi.d[3] = BSWAP4(ctr);
				1438	#else
				1439	PUTU32(ctx->Yi.c + 12, ctr);
				1440	#endif
				1441	else
				1442	ctx->Yi.d[3] = ctr;
				1443	}
				1444	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1445	out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
				1446	n = (n + 1) % 16;
				1447	if (mres == sizeof(ctx->Xn)) {
				1448	GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
				1449	mres = 0;
				1450	}
				1451	#else
				1452	c = in[i];
				1453	out[i] = c ^ ctx->EKi.c[n];
				1454	ctx->Xi.c[n] ^= c;
				1455	mres = n = (n + 1) % 16;
				1456	if (n == 0)
				1457	GCM_MUL(ctx);
				1458	#endif
				1459	}
				1460
				1461	ctx->mres = mres;
				1462	return 0;
				1463	}
				1464
				1465	int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
				1466	const unsigned char in, unsigned char out,
				1467	size_t len, ctr128_f stream)
				1468	{
				1469	#if defined(OPENSSL_SMALL_FOOTPRINT)
				1470	return CRYPTO_gcm128_encrypt(ctx, in, out, len);
				1471	#else
				1472	const union {
				1473	long one;
				1474	char little;
				1475	} is_endian = { 1 };
				1476	unsigned int n, ctr, mres;
				1477	size_t i;
				1478	u64 mlen = ctx->len.u[1];
				1479	void *key = ctx->key;
				1480	# ifdef GCM_FUNCREF_4BIT
				1481	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				1482	# ifdef GHASH
				1483	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				1484	const u8 *inp, size_t len) = ctx->ghash;
				1485	# endif
				1486	# endif
				1487
				1488	mlen += len;
				1489	if (mlen > ((U64(1) << 36) - 32) \|\| (sizeof(len) == 8 && mlen < len))
				1490	return -1;
				1491	ctx->len.u[1] = mlen;
				1492
				1493	mres = ctx->mres;
				1494
				1495	if (ctx->ares) {
				1496	/* First call to encrypt finalizes GHASH(AAD) */
				1497	#if defined(GHASH)
				1498	if (len == 0) {
				1499	GCM_MUL(ctx);
				1500	ctx->ares = 0;
				1501	return 0;
				1502	}
				1503	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
				1504	ctx->Xi.u[0] = 0;
				1505	ctx->Xi.u[1] = 0;
				1506	mres = sizeof(ctx->Xi);
				1507	#else
				1508	GCM_MUL(ctx);
				1509	#endif
				1510	ctx->ares = 0;
				1511	}
				1512
				1513	if (is_endian.little)
				1514	# ifdef BSWAP4
				1515	ctr = BSWAP4(ctx->Yi.d[3]);
				1516	# else
				1517	ctr = GETU32(ctx->Yi.c + 12);
				1518	# endif
				1519	else
				1520	ctr = ctx->Yi.d[3];
				1521
				1522	n = mres % 16;
				1523	if (n) {
				1524	# if defined(GHASH)
				1525	while (n && len) {
				1526	ctx->Xn[mres++] = (out++) = (in++) ^ ctx->EKi.c[n];
				1527	--len;
				1528	n = (n + 1) % 16;
				1529	}
				1530	if (n == 0) {
				1531	GHASH(ctx, ctx->Xn, mres);
				1532	mres = 0;
				1533	} else {
				1534	ctx->mres = mres;
				1535	return 0;
				1536	}
				1537	# else
				1538	while (n && len) {
				1539	ctx->Xi.c[n] ^= (out++) = (in++) ^ ctx->EKi.c[n];
				1540	--len;
				1541	n = (n + 1) % 16;
				1542	}
				1543	if (n == 0) {
				1544	GCM_MUL(ctx);
				1545	mres = 0;
				1546	} else {
				1547	ctx->mres = n;
				1548	return 0;
				1549	}
				1550	# endif
				1551	}
				1552	# if defined(GHASH)
				1553	if (len >= 16 && mres) {
				1554	GHASH(ctx, ctx->Xn, mres);
				1555	mres = 0;
				1556	}
				1557	# if defined(GHASH_CHUNK)
				1558	while (len >= GHASH_CHUNK) {
				1559	(*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
				1560	ctr += GHASH_CHUNK / 16;
				1561	if (is_endian.little)
				1562	# ifdef BSWAP4
				1563	ctx->Yi.d[3] = BSWAP4(ctr);
				1564	# else
				1565	PUTU32(ctx->Yi.c + 12, ctr);
				1566	# endif
				1567	else
				1568	ctx->Yi.d[3] = ctr;
				1569	GHASH(ctx, out, GHASH_CHUNK);
				1570	out += GHASH_CHUNK;
				1571	in += GHASH_CHUNK;
				1572	len -= GHASH_CHUNK;
				1573	}
				1574	# endif
				1575	# endif
				1576	if ((i = (len & (size_t)-16))) {
				1577	size_t j = i / 16;
				1578
				1579	(*stream) (in, out, j, key, ctx->Yi.c);
				1580	ctr += (unsigned int)j;
				1581	if (is_endian.little)
				1582	# ifdef BSWAP4
				1583	ctx->Yi.d[3] = BSWAP4(ctr);
				1584	# else
				1585	PUTU32(ctx->Yi.c + 12, ctr);
				1586	# endif
				1587	else
				1588	ctx->Yi.d[3] = ctr;
				1589	in += i;
				1590	len -= i;
				1591	# if defined(GHASH)
				1592	GHASH(ctx, out, i);
				1593	out += i;
				1594	# else
				1595	while (j--) {
				1596	for (i = 0; i < 16; ++i)
				1597	ctx->Xi.c[i] ^= out[i];
				1598	GCM_MUL(ctx);
				1599	out += 16;
				1600	}
				1601	# endif
				1602	}
				1603	if (len) {
				1604	(*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
				1605	++ctr;
				1606	if (is_endian.little)
				1607	# ifdef BSWAP4
				1608	ctx->Yi.d[3] = BSWAP4(ctr);
				1609	# else
				1610	PUTU32(ctx->Yi.c + 12, ctr);
				1611	# endif
				1612	else
				1613	ctx->Yi.d[3] = ctr;
				1614	while (len--) {
				1615	# if defined(GHASH)
				1616	ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
				1617	# else
				1618	ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
				1619	# endif
				1620	++n;
				1621	}
				1622	}
				1623
				1624	ctx->mres = mres;
				1625	return 0;
				1626	#endif
				1627	}
				1628
				1629	int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
				1630	const unsigned char in, unsigned char out,
				1631	size_t len, ctr128_f stream)
				1632	{
				1633	#if defined(OPENSSL_SMALL_FOOTPRINT)
				1634	return CRYPTO_gcm128_decrypt(ctx, in, out, len);
				1635	#else
				1636	const union {
				1637	long one;
				1638	char little;
				1639	} is_endian = { 1 };
				1640	unsigned int n, ctr, mres;
				1641	size_t i;
				1642	u64 mlen = ctx->len.u[1];
				1643	void *key = ctx->key;
				1644	# ifdef GCM_FUNCREF_4BIT
				1645	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				1646	# ifdef GHASH
				1647	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				1648	const u8 *inp, size_t len) = ctx->ghash;
				1649	# endif
				1650	# endif
				1651
				1652	mlen += len;
				1653	if (mlen > ((U64(1) << 36) - 32) \|\| (sizeof(len) == 8 && mlen < len))
				1654	return -1;
				1655	ctx->len.u[1] = mlen;
				1656
				1657	mres = ctx->mres;
				1658
				1659	if (ctx->ares) {
				1660	/* First call to decrypt finalizes GHASH(AAD) */
				1661	# if defined(GHASH)
				1662	if (len == 0) {
				1663	GCM_MUL(ctx);
				1664	ctx->ares = 0;
				1665	return 0;
				1666	}
				1667	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
				1668	ctx->Xi.u[0] = 0;
				1669	ctx->Xi.u[1] = 0;
				1670	mres = sizeof(ctx->Xi);
				1671	# else
				1672	GCM_MUL(ctx);
				1673	# endif
				1674	ctx->ares = 0;
				1675	}
				1676
				1677	if (is_endian.little)
				1678	# ifdef BSWAP4
				1679	ctr = BSWAP4(ctx->Yi.d[3]);
				1680	# else
				1681	ctr = GETU32(ctx->Yi.c + 12);
				1682	# endif
				1683	else
				1684	ctr = ctx->Yi.d[3];
				1685
				1686	n = mres % 16;
				1687	if (n) {
				1688	# if defined(GHASH)
				1689	while (n && len) {
				1690	(out++) = (ctx->Xn[mres++] = (in++)) ^ ctx->EKi.c[n];
				1691	--len;
				1692	n = (n + 1) % 16;
				1693	}
				1694	if (n == 0) {
				1695	GHASH(ctx, ctx->Xn, mres);
				1696	mres = 0;
				1697	} else {
				1698	ctx->mres = mres;
				1699	return 0;
				1700	}
				1701	# else
				1702	while (n && len) {
				1703	u8 c = *(in++);
				1704	*(out++) = c ^ ctx->EKi.c[n];
				1705	ctx->Xi.c[n] ^= c;
				1706	--len;
				1707	n = (n + 1) % 16;
				1708	}
				1709	if (n == 0) {
				1710	GCM_MUL(ctx);
				1711	mres = 0;
				1712	} else {
				1713	ctx->mres = n;
				1714	return 0;
				1715	}
				1716	# endif
				1717	}
				1718	# if defined(GHASH)
				1719	if (len >= 16 && mres) {
				1720	GHASH(ctx, ctx->Xn, mres);
				1721	mres = 0;
				1722	}
				1723	# if defined(GHASH_CHUNK)
				1724	while (len >= GHASH_CHUNK) {
				1725	GHASH(ctx, in, GHASH_CHUNK);
				1726	(*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
				1727	ctr += GHASH_CHUNK / 16;
				1728	if (is_endian.little)
				1729	# ifdef BSWAP4
				1730	ctx->Yi.d[3] = BSWAP4(ctr);
				1731	# else
				1732	PUTU32(ctx->Yi.c + 12, ctr);
				1733	# endif
				1734	else
				1735	ctx->Yi.d[3] = ctr;
				1736	out += GHASH_CHUNK;
				1737	in += GHASH_CHUNK;
				1738	len -= GHASH_CHUNK;
				1739	}
				1740	# endif
				1741	# endif
				1742	if ((i = (len & (size_t)-16))) {
				1743	size_t j = i / 16;
				1744
				1745	# if defined(GHASH)
				1746	GHASH(ctx, in, i);
				1747	# else
				1748	while (j--) {
				1749	size_t k;
				1750	for (k = 0; k < 16; ++k)
				1751	ctx->Xi.c[k] ^= in[k];
				1752	GCM_MUL(ctx);
				1753	in += 16;
				1754	}
				1755	j = i / 16;
				1756	in -= i;
				1757	# endif
				1758	(*stream) (in, out, j, key, ctx->Yi.c);
				1759	ctr += (unsigned int)j;
				1760	if (is_endian.little)
				1761	# ifdef BSWAP4
				1762	ctx->Yi.d[3] = BSWAP4(ctr);
				1763	# else
				1764	PUTU32(ctx->Yi.c + 12, ctr);
				1765	# endif
				1766	else
				1767	ctx->Yi.d[3] = ctr;
				1768	out += i;
				1769	in += i;
				1770	len -= i;
				1771	}
				1772	if (len) {
				1773	(*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
				1774	++ctr;
				1775	if (is_endian.little)
				1776	# ifdef BSWAP4
				1777	ctx->Yi.d[3] = BSWAP4(ctr);
				1778	# else
				1779	PUTU32(ctx->Yi.c + 12, ctr);
				1780	# endif
				1781	else
				1782	ctx->Yi.d[3] = ctr;
				1783	while (len--) {
				1784	# if defined(GHASH)
				1785	out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
				1786	# else
				1787	u8 c = in[n];
				1788	ctx->Xi.c[mres++] ^= c;
				1789	out[n] = c ^ ctx->EKi.c[n];
				1790	# endif
				1791	++n;
				1792	}
				1793	}
				1794
				1795	ctx->mres = mres;
				1796	return 0;
				1797	#endif
				1798	}
				1799
				1800	int CRYPTO_gcm128_finish(GCM128_CONTEXT ctx, const unsigned char tag,
				1801	size_t len)
				1802	{
				1803	const union {
				1804	long one;
				1805	char little;
				1806	} is_endian = { 1 };
				1807	u64 alen = ctx->len.u[0] << 3;
				1808	u64 clen = ctx->len.u[1] << 3;
				1809	#ifdef GCM_FUNCREF_4BIT
				1810	void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
				1811	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1812	void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
				1813	const u8 *inp, size_t len) = ctx->ghash;
				1814	# endif
				1815	#endif
				1816
				1817	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1818	u128 bitlen;
				1819	unsigned int mres = ctx->mres;
				1820
				1821	if (mres) {
				1822	unsigned blocks = (mres + 15) & -16;
				1823
				1824	memset(ctx->Xn + mres, 0, blocks - mres);
				1825	mres = blocks;
				1826	if (mres == sizeof(ctx->Xn)) {
				1827	GHASH(ctx, ctx->Xn, mres);
				1828	mres = 0;
				1829	}
				1830	} else if (ctx->ares) {
				1831	GCM_MUL(ctx);
				1832	}
				1833	#else
				1834	if (ctx->mres \|\| ctx->ares)
				1835	GCM_MUL(ctx);
				1836	#endif
				1837
				1838	if (is_endian.little) {
				1839	#ifdef BSWAP8
				1840	alen = BSWAP8(alen);
				1841	clen = BSWAP8(clen);
				1842	#else
				1843	u8 *p = ctx->len.c;
				1844
				1845	ctx->len.u[0] = alen;
				1846	ctx->len.u[1] = clen;
				1847
				1848	alen = (u64)GETU32(p) << 32 \| GETU32(p + 4);
				1849	clen = (u64)GETU32(p + 8) << 32 \| GETU32(p + 12);
				1850	#endif
				1851	}
				1852
				1853	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
				1854	bitlen.hi = alen;
				1855	bitlen.lo = clen;
				1856	memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
				1857	mres += sizeof(bitlen);
				1858	GHASH(ctx, ctx->Xn, mres);
				1859	#else
				1860	ctx->Xi.u[0] ^= alen;
				1861	ctx->Xi.u[1] ^= clen;
				1862	GCM_MUL(ctx);
				1863	#endif
				1864
				1865	ctx->Xi.u[0] ^= ctx->EK0.u[0];
				1866	ctx->Xi.u[1] ^= ctx->EK0.u[1];
				1867
				1868	if (tag && len <= sizeof(ctx->Xi))
				1869	return CRYPTO_memcmp(ctx->Xi.c, tag, len);
				1870	else
				1871	return -1;
				1872	}
				1873
				1874	void CRYPTO_gcm128_tag(GCM128_CONTEXT ctx, unsigned char tag, size_t len)
				1875	{
				1876	CRYPTO_gcm128_finish(ctx, NULL, 0);
				1877	memcpy(tag, ctx->Xi.c,
				1878	len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
				1879	}
				1880
				1881	GCM128_CONTEXT CRYPTO_gcm128_new(void key, block128_f block)
				1882	{
				1883	GCM128_CONTEXT *ret;
				1884
				1885	if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
				1886	CRYPTO_gcm128_init(ret, key, block);
				1887
				1888	return ret;
				1889	}
				1890
				1891	void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
				1892	{
				1893	OPENSSL_clear_free(ctx, sizeof(*ctx));
				1894	}