src/crypto/gcm.c - ipxe - Git at Google

 /*
  * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2 of the
  * License, or any later version.
  *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  * 02110-1301, USA.
  *
  * You can also choose to distribute this program under the terms of
  * the Unmodified Binary Distribution Licence (as given in the file
  * COPYING.UBDL), provided that you have satisfied its requirements.
  */

 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

 /** @file
  *
  * Galois/Counter Mode (GCM)
  *
  * The GCM algorithm is specified in
  *
  * https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
  * https://csrc.nist.rip/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-spec.pdf
  *
  */

 #include <stdint.h>
 #include <string.h>
 #include <byteswap.h>
 #include <ipxe/crypto.h>
 #include <ipxe/gcm.h>

 /**
  * Perform encryption
  *
  * This value is chosen to allow for ANDing with a fragment length.
  */
 #define GCM_FL_ENCRYPT 0x00ff

 /**
  * Calculate hash over an initialisation vector value
  *
  * The hash calculation for a non 96-bit initialisation vector is
  * identical to the calculation used for additional data, except that
  * the non-additional data length counter is used.
  */
 #define GCM_FL_IV 0x0100

 /**
  * GCM field polynomial
  *
  * GCM treats 128-bit blocks as polynomials in GF(2^128) with the
  * field polynomial f(x) = 1 + x + x^2 + x^7 + x^128.
  *
  * In a somewhat bloody-minded interpretation of "big-endian", the
  * constant term (with degree zero) is arbitrarily placed in the
  * leftmost bit of the big-endian binary representation (i.e. the most
  * significant bit of byte 0), thereby failing to correspond to the
  * bit ordering in any CPU architecture in existence.  This
  * necessitates some wholly gratuitous byte reversals when
  * constructing the multiplication tables, since all CPUs will treat
  * bit 0 as being the least significant bit within a byte.
  *
  * The field polynomial maps to the 128-bit constant
  * 0xe1000000000000000000000000000000 (with the x^128 term outside the
  * 128-bit range), and can therefore be treated as a single-byte
  * value.
  */
 #define GCM_POLY 0xe1

 /**
  * Hash key for which multiplication tables are cached
  *
  * GCM operates much more efficiently with a cached multiplication
  * table, which costs 4kB per hash key.  Since this exceeds the
  * available stack space, we place a single 4kB cache in .bss and
  * recalculate the cached values as required.  In the common case of a
  * single HTTPS connection being used to download a (relatively) large
  * file, the same key will be used repeatedly for almost all GCM
  * operations, and so the overhead of recalculation is negligible.
  */
 static const union gcm_block *gcm_cached_key;

 /**
  * Cached multiplication table (M0) for Shoup's method
  *
  * Each entry within this table represents the result of multiplying
  * the cached hash key by an arbitrary 8-bit polynomial.
  */
 static union gcm_block gcm_cached_mult[256];

 /**
  * Cached reduction table (R) for Shoup's method
  *
  * Each entry within this table represents the result of multiplying
  * the fixed polynomial x^128 by an arbitrary 8-bit polynomial.  Only
  * the leftmost 16 bits are stored, since all other bits within the
  * result will always be zero.
  */
 static uint16_t gcm_cached_reduce[256];

 /** Offset of a field within GCM context */
 #define gcm_offset( field ) offsetof ( struct gcm_context, field )

 /**
  * Reverse bits in a byte
  *
  * @v byte		Byte
  * @ret etyb		Bit-reversed byte
  */
 static inline __attribute__ (( always_inline )) uint8_t
 gcm_reverse ( const uint8_t byte ) {
 	uint8_t etyb = etyb;
 	uint8_t mask;

 	for ( mask = 1 ; mask ; mask <<= 1 ) {
 		etyb <<= 1;
 		if ( byte & mask )
 			etyb |= 1;
 	}
 	return etyb;
 }

 /**
  * Update GCM counter
  *
  * @v ctr		Counter
  * @v delta		Amount to add to counter
  */
 static inline __attribute__ (( always_inline )) void
 gcm_count ( union gcm_block *ctr, uint32_t delta ) {
 	uint32_t *value = &ctr->ctr.value;

 	/* Update counter modulo 2^32 */
 	*value = cpu_to_be32 ( be32_to_cpu ( *value ) + delta );
 }

 /**
  * XOR partial data block
  *
  * @v src1		Source buffer 1
  * @v src2		Source buffer 2
  * @v dst		Destination buffer
  * @v len		Length
  */
 static inline void gcm_xor ( const void *src1, const void *src2, void *dst,
 			     size_t len ) {
 	uint8_t *dst_bytes = dst;
 	const uint8_t *src1_bytes = src1;
 	const uint8_t *src2_bytes = src2;

 	/* XOR one byte at a time */
 	while ( len-- )
 		*(dst_bytes++) = ( *(src1_bytes++) ^ *(src2_bytes++) );
 }

 /**
  * XOR whole data block in situ
  *
  * @v src		Source block
  * @v dst		Destination block
  */
 static inline void gcm_xor_block ( const union gcm_block *src,
 				   union gcm_block *dst ) {

 	/* XOR whole dwords */
 	dst->dword[0] ^= src->dword[0];
 	dst->dword[1] ^= src->dword[1];
 	dst->dword[2] ^= src->dword[2];
 	dst->dword[3] ^= src->dword[3];
 }

 /**
  * Multiply polynomial by (x)
  *
  * @v mult		Multiplicand
  * @v res		Result
  */
 static void gcm_multiply_x ( const union gcm_block *mult,
 			     union gcm_block *res ) {
 	unsigned int i;
 	uint8_t byte;
 	uint8_t carry;

 	/* Multiply by (x) by shifting all bits rightward */
 	for ( i = 0, carry = 0 ; i < sizeof ( res->byte ) ; i++ ) {
 		byte = mult->byte[i];
 		res->byte[i] = ( ( carry << 7 ) | ( byte >> 1 ) );
 		carry = ( byte & 0x01 );
 	}

 	/* If result overflows, reduce modulo the field polynomial */
 	if ( carry )
 		res->byte[0] ^= GCM_POLY;
 }

 /**
  * Construct cached tables
  *
  * @v key		Hash key
  * @v context		Context
  */
 static void gcm_cache ( const union gcm_block *key ) {
 	union gcm_block *mult;
 	uint16_t reduce;
 	unsigned int this;
 	unsigned int other;
 	unsigned int i;

 	/* Calculate M0[1..255] and R[1..255]
 	 *
 	 * The R[] values are independent of the key, but the overhead
 	 * of recalculating them here is negligible and saves on
 	 * overall code size since the calculations are related.
 	 */
 	for ( i = 1 ; i < 256 ; i++ ) {

 		/* Reverse bit order to compensate for poor life choices */
 		this = gcm_reverse ( i );

 		/* Construct entries */
 		mult = &gcm_cached_mult[this];
 		if ( this & 0x80 ) {

 			/* Odd number: entry[i] = entry[i - 1] + poly */
 			other = ( this & 0x7f ); /* bit-reversed (i - 1) */
 			gcm_xor ( key, &gcm_cached_mult[other], mult,
 				  sizeof ( *mult ) );
 			reduce = gcm_cached_reduce[other];
 			reduce ^= be16_to_cpu ( GCM_POLY << 8 );
 			gcm_cached_reduce[this] = reduce;

 		} else {

 			/* Even number: entry[i] = entry[i/2] * (x) */
 			other = ( this << 1 ); /* bit-reversed (i / 2) */
 			gcm_multiply_x ( &gcm_cached_mult[other], mult );
 			reduce = be16_to_cpu ( gcm_cached_reduce[other] );
 			reduce >>= 1;
 			gcm_cached_reduce[this] = cpu_to_be16 ( reduce );
 		}
 	}

 	/* Record cached key */
 	gcm_cached_key = key;
 }

 /**
  * Multiply polynomial by (x^8) in situ
  *
  * @v poly		Multiplicand and result
  */
 static void gcm_multiply_x_8 ( union gcm_block *poly ) {
 	uint8_t *byte;
 	uint8_t msb;

 	/* Reduction table must already have been calculated */
 	assert ( gcm_cached_key != NULL );

 	/* Record most significant byte */
 	byte = &poly->byte[ sizeof ( poly->byte ) - 1 ];
 	msb = *byte;

 	/* Multiply least significant bytes by shifting */
 	for ( ; byte > &poly->byte[0] ; byte-- )
 		*byte = *( byte - 1 );
 	*byte = 0;

 	/* Multiply most significant byte via reduction table */
 	poly->word[0] ^= gcm_cached_reduce[msb];
 }

 /**
  * Multiply polynomial by hash key in situ
  *
  * @v key		Hash key
  * @v poly		Multiplicand and result
  */
 static void gcm_multiply_key ( const union gcm_block *key,
 			       union gcm_block *poly ) {
 	union gcm_block res;
 	uint8_t *byte;

 	/* Construct tables, if necessary */
 	if ( gcm_cached_key != key )
 		gcm_cache ( key );

 	/* Multiply using Shoup's algorithm */
 	byte = &poly->byte[ sizeof ( poly->byte ) - 1 ];
 	memcpy ( &res, &gcm_cached_mult[ *byte ], sizeof ( res ) );
 	for ( byte-- ; byte >= &poly->byte[0] ; byte-- ) {
 		gcm_multiply_x_8 ( &res );
 		gcm_xor_block ( &gcm_cached_mult[ *byte ], &res );
 	}

 	/* Overwrite result */
 	memcpy ( poly, &res, sizeof ( *poly ) );
 }

 /**
  * Encrypt/decrypt/authenticate data
  *
  * @v context		Context
  * @v src		Input data
  * @v dst		Output data, or NULL to process additional data
  * @v len		Length of data
  * @v flags		Operation flags
  */
 static void gcm_process ( struct gcm_context *context, const void *src,
 			  void *dst, size_t len, unsigned int flags ) {
 	union gcm_block tmp;
 	uint64_t *total;
 	size_t frag_len;
 	unsigned int block;

 	/* Calculate block number (for debugging) */
 	block = ( ( ( context->len.len.add + 8 * sizeof ( tmp ) - 1 ) /
 		    ( 8 * sizeof ( tmp ) ) ) +
 		  ( ( context->len.len.data + 8 * sizeof ( tmp ) - 1 ) /
 		    ( 8 * sizeof ( tmp ) ) ) + 1 );

 	/* Update total length (in bits) */
 	total = ( ( dst || ( flags & GCM_FL_IV ) ) ?
 		  &context->len.len.data : &context->len.len.add );
 	*total += ( len * 8 );

 	/* Process data */
 	for ( ; len ; src += frag_len, len -= frag_len, block++ ) {

 		/* Calculate fragment length */
 		frag_len = len;
 		if ( frag_len > sizeof ( tmp ) )
 			frag_len = sizeof ( tmp );

 		/* Update hash with input data */
 		gcm_xor ( src, &context->hash, &context->hash, frag_len );

 		/* Encrypt/decrypt block, if applicable */
 		if ( dst ) {

 			/* Increment counter */
 			gcm_count ( &context->ctr, 1 );

 			/* Encrypt counter */
 			DBGC2 ( context, "GCM %p Y[%d]:\n", context, block );
 			DBGC2_HDA ( context, 0, &context->ctr,
 				    sizeof ( context->ctr ) );
 			cipher_encrypt ( context->raw_cipher, &context->raw_ctx,
 					 &context->ctr, &tmp, sizeof ( tmp ) );
 			DBGC2 ( context, "GCM %p E(K,Y[%d]):\n",
 				context, block );
 			DBGC2_HDA ( context, 0, &tmp, sizeof ( tmp ) );

 			/* Encrypt/decrypt data */
 			gcm_xor ( src, &tmp, dst, frag_len );
 			dst += frag_len;

 			/* Update hash with encrypted data, if applicable */
 			gcm_xor ( &tmp, &context->hash, &context->hash,
 				  ( frag_len & flags ) );
 		}

 		/* Update hash */
 		gcm_multiply_key ( &context->key, &context->hash );
 		DBGC2 ( context, "GCM %p X[%d]:\n", context, block );
 		DBGC2_HDA ( context, 0, &context->hash,
 			    sizeof ( context->hash ) );
 	}
 }

 /**
  * Construct hash
  *
  * @v context		Context
  * @v hash		Hash to fill in
  */
 static void gcm_hash ( struct gcm_context *context, union gcm_block *hash ) {

 	/* Construct big-endian lengths block */
 	hash->len.add = cpu_to_be64 ( context->len.len.add );
 	hash->len.data = cpu_to_be64 ( context->len.len.data );
 	DBGC2 ( context, "GCM %p len(A)||len(C):\n", context );
 	DBGC2_HDA ( context, 0, hash, sizeof ( *hash ) );

 	/* Update hash */
 	gcm_xor_block ( &context->hash, hash );
 	gcm_multiply_key ( &context->key, hash );
 	DBGC2 ( context, "GCM %p GHASH(H,A,C):\n", context );
 	DBGC2_HDA ( context, 0, hash, sizeof ( *hash ) );
 }

 /**
  * Construct tag
  *
  * @v context		Context
  * @v tag		Tag
  */
 void gcm_tag ( struct gcm_context *context, union gcm_block *tag ) {
 	union gcm_block tmp;
 	uint32_t offset;

 	/* Construct hash */
 	gcm_hash ( context, tag );

 	/* Construct encrypted initial counter value */
 	memcpy ( &tmp, &context->ctr, sizeof ( tmp ) );
 	offset = ( ( -context->len.len.data ) / ( 8 * sizeof ( tmp ) ) );
 	gcm_count ( &tmp, offset );
 	cipher_encrypt ( context->raw_cipher, &context->raw_ctx, &tmp,
 			 &tmp, sizeof ( tmp ) );
 	DBGC2 ( context, "GCM %p E(K,Y[0]):\n", context );
 	DBGC2_HDA ( context, 0, &tmp, sizeof ( tmp ) );

 	/* Construct tag */
 	gcm_xor_block ( &tmp, tag );
 	DBGC2 ( context, "GCM %p T:\n", context );
 	DBGC2_HDA ( context, 0, tag, sizeof ( *tag ) );
 }

 /**
  * Set key
  *
  * @v context		Context
  * @v key		Key
  * @v keylen		Key length
  * @v raw_cipher	Underlying cipher
  * @ret rc		Return status code
  */
 int gcm_setkey ( struct gcm_context *context, const void *key, size_t keylen,
 		 struct cipher_algorithm *raw_cipher ) {
 	int rc;

 	/* Initialise GCM context */
 	memset ( context, 0, sizeof ( *context ) );
 	context->raw_cipher = raw_cipher;

 	/* Set underlying block cipher key */
 	if ( ( rc = cipher_setkey ( raw_cipher, context->raw_ctx, key,
 				    keylen ) ) != 0 )
 		return rc;

 	/* Construct GCM hash key */
 	cipher_encrypt ( raw_cipher, context->raw_ctx, &context->ctr,
 			 &context->key, sizeof ( context->key ) );
 	DBGC2 ( context, "GCM %p H:\n", context );
 	DBGC2_HDA ( context, 0, &context->key, sizeof ( context->key ) );

 	/* Reset counter */
 	context->ctr.ctr.value = cpu_to_be32 ( 1 );

 	/* Construct cached tables */
 	gcm_cache ( &context->key );

 	return 0;
 }

 /**
  * Set initialisation vector
  *
  * @v ctx		Context
  * @v iv		Initialisation vector
  * @v ivlen		Initialisation vector length
  */
 void gcm_setiv ( struct gcm_context *context, const void *iv, size_t ivlen ) {

 	/* Reset non-key state */
 	memset ( context, 0, gcm_offset ( key ) );
 	build_assert ( gcm_offset ( key ) > gcm_offset ( hash ) );
 	build_assert ( gcm_offset ( key ) > gcm_offset ( len ) );
 	build_assert ( gcm_offset ( key ) > gcm_offset ( ctr ) );
 	build_assert ( gcm_offset ( key ) < gcm_offset ( raw_cipher ) );
 	build_assert ( gcm_offset ( key ) < gcm_offset ( raw_ctx ) );

 	/* Reset counter */
 	context->ctr.ctr.value = cpu_to_be32 ( 1 );

 	/* Process initialisation vector */
 	if ( ivlen == sizeof ( context->ctr.ctr.iv ) ) {

 		/* Initialisation vector is exactly 96 bits, use it as-is */
 		memcpy ( context->ctr.ctr.iv, iv, ivlen );

 	} else {

 		/* Calculate hash over initialisation vector */
 		gcm_process ( context, iv, NULL, ivlen, GCM_FL_IV );
 		gcm_hash ( context, &context->ctr );
 		assert ( context->len.len.add == 0 );

 		/* Reset non-key, non-counter state */
 		memset ( context, 0, gcm_offset ( ctr ) );
 		build_assert ( gcm_offset ( ctr ) > gcm_offset ( hash ) );
 		build_assert ( gcm_offset ( ctr ) > gcm_offset ( len ) );
 		build_assert ( gcm_offset ( ctr ) < gcm_offset ( key ) );
 		build_assert ( gcm_offset ( ctr ) < gcm_offset ( raw_cipher ) );
 		build_assert ( gcm_offset ( ctr ) < gcm_offset ( raw_ctx ) );
 	}

 	DBGC2 ( context, "GCM %p Y[0]:\n", context );
 	DBGC2_HDA ( context, 0, &context->ctr, sizeof ( context->ctr ) );
 }

 /**
  * Encrypt data
  *
  * @v context		Context
  * @v src		Data to encrypt
  * @v dst		Buffer for encrypted data, or NULL for additional data
  * @v len		Length of data
  */
 void gcm_encrypt ( struct gcm_context *context, const void *src, void *dst,
 		   size_t len ) {

 	/* Process data */
 	gcm_process ( context, src, dst, len, GCM_FL_ENCRYPT );
 }

 /**
  * Decrypt data
  *
  * @v context		Context
  * @v src		Data to decrypt
  * @v dst		Buffer for decrypted data, or NULL for additional data
  * @v len		Length of data
  */
 void gcm_decrypt ( struct gcm_context *context, const void *src, void *dst,
 		   size_t len ) {

 	/* Process data */
 	gcm_process ( context, src, dst, len, 0 );
 }
	/*
	* Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public License as
	* published by the Free Software Foundation; either version 2 of the
	* License, or any later version.
	*
	* This program is distributed in the hope that it will be useful, but
	* WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	* 02110-1301, USA.
	*
	* You can also choose to distribute this program under the terms of
	* the Unmodified Binary Distribution Licence (as given in the file
	* COPYING.UBDL), provided that you have satisfied its requirements.
	*/

	FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

	/** @file
	*
	* Galois/Counter Mode (GCM)
	*
	* The GCM algorithm is specified in
	*
	* https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
	* https://csrc.nist.rip/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-spec.pdf
	*
	*/

	#include <stdint.h>
	#include <string.h>
	#include <byteswap.h>
	#include <ipxe/crypto.h>
	#include <ipxe/gcm.h>

	/**
	* Perform encryption
	*
	* This value is chosen to allow for ANDing with a fragment length.
	*/
	#define GCM_FL_ENCRYPT 0x00ff

	/**
	* Calculate hash over an initialisation vector value
	*
	* The hash calculation for a non 96-bit initialisation vector is
	* identical to the calculation used for additional data, except that
	* the non-additional data length counter is used.
	*/
	#define GCM_FL_IV 0x0100

	/**
	* GCM field polynomial
	*
	* GCM treats 128-bit blocks as polynomials in GF(2^128) with the
	* field polynomial f(x) = 1 + x + x^2 + x^7 + x^128.
	*
	* In a somewhat bloody-minded interpretation of "big-endian", the
	* constant term (with degree zero) is arbitrarily placed in the
	* leftmost bit of the big-endian binary representation (i.e. the most
	* significant bit of byte 0), thereby failing to correspond to the
	* bit ordering in any CPU architecture in existence. This
	* necessitates some wholly gratuitous byte reversals when
	* constructing the multiplication tables, since all CPUs will treat
	* bit 0 as being the least significant bit within a byte.
	*
	* The field polynomial maps to the 128-bit constant
	* 0xe1000000000000000000000000000000 (with the x^128 term outside the
	* 128-bit range), and can therefore be treated as a single-byte
	* value.
	*/
	#define GCM_POLY 0xe1

	/**
	* Hash key for which multiplication tables are cached
	*
	* GCM operates much more efficiently with a cached multiplication
	* table, which costs 4kB per hash key. Since this exceeds the
	* available stack space, we place a single 4kB cache in .bss and
	* recalculate the cached values as required. In the common case of a
	* single HTTPS connection being used to download a (relatively) large
	* file, the same key will be used repeatedly for almost all GCM
	* operations, and so the overhead of recalculation is negligible.
	*/
	static const union gcm_block *gcm_cached_key;

	/**
	* Cached multiplication table (M0) for Shoup's method
	*
	* Each entry within this table represents the result of multiplying
	* the cached hash key by an arbitrary 8-bit polynomial.
	*/
	static union gcm_block gcm_cached_mult[256];

	/**
	* Cached reduction table (R) for Shoup's method
	*
	* Each entry within this table represents the result of multiplying
	* the fixed polynomial x^128 by an arbitrary 8-bit polynomial. Only
	* the leftmost 16 bits are stored, since all other bits within the
	* result will always be zero.
	*/
	static uint16_t gcm_cached_reduce[256];

	/** Offset of a field within GCM context */
	#define gcm_offset( field ) offsetof ( struct gcm_context, field )

	/**
	* Reverse bits in a byte
	*
	* @v byte Byte
	* @ret etyb Bit-reversed byte
	*/
	static inline __attribute__ (( always_inline )) uint8_t
	gcm_reverse ( const uint8_t byte ) {
	uint8_t etyb = etyb;
	uint8_t mask;

	for ( mask = 1 ; mask ; mask <<= 1 ) {
	etyb <<= 1;
	if ( byte & mask )
	etyb \|= 1;
	}
	return etyb;
	}

	/**
	* Update GCM counter
	*
	* @v ctr Counter
	* @v delta Amount to add to counter
	*/
	static inline __attribute__ (( always_inline )) void
	gcm_count ( union gcm_block *ctr, uint32_t delta ) {
	uint32_t *value = &ctr->ctr.value;

	/* Update counter modulo 2^32 */
	value = cpu_to_be32 ( be32_to_cpu ( value ) + delta );
	}

	/**
	* XOR partial data block
	*
	* @v src1 Source buffer 1
	* @v src2 Source buffer 2
	* @v dst Destination buffer
	* @v len Length
	*/
	static inline void gcm_xor ( const void src1, const void src2, void *dst,
	size_t len ) {
	uint8_t *dst_bytes = dst;
	const uint8_t *src1_bytes = src1;
	const uint8_t *src2_bytes = src2;

	/* XOR one byte at a time */
	while ( len-- )
	(dst_bytes++) = ( (src1_bytes++) ^ *(src2_bytes++) );
	}

	/**
	* XOR whole data block in situ
	*
	* @v src Source block
	* @v dst Destination block
	*/
	static inline void gcm_xor_block ( const union gcm_block *src,
	union gcm_block *dst ) {

	/* XOR whole dwords */
	dst->dword[0] ^= src->dword[0];
	dst->dword[1] ^= src->dword[1];
	dst->dword[2] ^= src->dword[2];
	dst->dword[3] ^= src->dword[3];
	}

	/**
	* Multiply polynomial by (x)
	*
	* @v mult Multiplicand
	* @v res Result
	*/
	static void gcm_multiply_x ( const union gcm_block *mult,
	union gcm_block *res ) {
	unsigned int i;
	uint8_t byte;
	uint8_t carry;

	/* Multiply by (x) by shifting all bits rightward */
	for ( i = 0, carry = 0 ; i < sizeof ( res->byte ) ; i++ ) {
	byte = mult->byte[i];
	res->byte[i] = ( ( carry << 7 ) \| ( byte >> 1 ) );
	carry = ( byte & 0x01 );
	}

	/* If result overflows, reduce modulo the field polynomial */
	if ( carry )
	res->byte[0] ^= GCM_POLY;
	}

	/**
	* Construct cached tables
	*
	* @v key Hash key
	* @v context Context
	*/
	static void gcm_cache ( const union gcm_block *key ) {
	union gcm_block *mult;
	uint16_t reduce;
	unsigned int this;
	unsigned int other;
	unsigned int i;

	/* Calculate M0[1..255] and R[1..255]
	*
	* The R[] values are independent of the key, but the overhead
	* of recalculating them here is negligible and saves on
	* overall code size since the calculations are related.
	*/
	for ( i = 1 ; i < 256 ; i++ ) {

	/* Reverse bit order to compensate for poor life choices */
	this = gcm_reverse ( i );

	/* Construct entries */
	mult = &gcm_cached_mult[this];
	if ( this & 0x80 ) {

	/* Odd number: entry[i] = entry[i - 1] + poly */
	other = ( this & 0x7f ); /* bit-reversed (i - 1) */
	gcm_xor ( key, &gcm_cached_mult[other], mult,
	sizeof ( *mult ) );
	reduce = gcm_cached_reduce[other];
	reduce ^= be16_to_cpu ( GCM_POLY << 8 );
	gcm_cached_reduce[this] = reduce;

	} else {

	/* Even number: entry[i] = entry[i/2] * (x) */
	other = ( this << 1 ); /* bit-reversed (i / 2) */
	gcm_multiply_x ( &gcm_cached_mult[other], mult );
	reduce = be16_to_cpu ( gcm_cached_reduce[other] );
	reduce >>= 1;
	gcm_cached_reduce[this] = cpu_to_be16 ( reduce );
	}
	}

	/* Record cached key */
	gcm_cached_key = key;
	}

	/**
	* Multiply polynomial by (x^8) in situ
	*
	* @v poly Multiplicand and result
	*/
	static void gcm_multiply_x_8 ( union gcm_block *poly ) {
	uint8_t *byte;
	uint8_t msb;

	/* Reduction table must already have been calculated */
	assert ( gcm_cached_key != NULL );

	/* Record most significant byte */
	byte = &poly->byte[ sizeof ( poly->byte ) - 1 ];
	msb = *byte;

	/* Multiply least significant bytes by shifting */
	for ( ; byte > &poly->byte[0] ; byte-- )
	byte = ( byte - 1 );
	*byte = 0;

	/* Multiply most significant byte via reduction table */
	poly->word[0] ^= gcm_cached_reduce[msb];
	}

	/**
	* Multiply polynomial by hash key in situ
	*
	* @v key Hash key
	* @v poly Multiplicand and result
	*/
	static void gcm_multiply_key ( const union gcm_block *key,
	union gcm_block *poly ) {
	union gcm_block res;
	uint8_t *byte;

	/* Construct tables, if necessary */
	if ( gcm_cached_key != key )
	gcm_cache ( key );

	/* Multiply using Shoup's algorithm */
	byte = &poly->byte[ sizeof ( poly->byte ) - 1 ];
	memcpy ( &res, &gcm_cached_mult[ *byte ], sizeof ( res ) );
	for ( byte-- ; byte >= &poly->byte[0] ; byte-- ) {
	gcm_multiply_x_8 ( &res );
	gcm_xor_block ( &gcm_cached_mult[ *byte ], &res );
	}

	/* Overwrite result */
	memcpy ( poly, &res, sizeof ( *poly ) );
	}

	/**
	* Encrypt/decrypt/authenticate data
	*
	* @v context Context
	* @v src Input data
	* @v dst Output data, or NULL to process additional data
	* @v len Length of data
	* @v flags Operation flags
	*/
	static void gcm_process ( struct gcm_context context, const void src,
	void *dst, size_t len, unsigned int flags ) {
	union gcm_block tmp;
	uint64_t *total;
	size_t frag_len;
	unsigned int block;

	/* Calculate block number (for debugging) */
	block = ( ( ( context->len.len.add + 8 * sizeof ( tmp ) - 1 ) /
	( 8 * sizeof ( tmp ) ) ) +
	( ( context->len.len.data + 8 * sizeof ( tmp ) - 1 ) /
	( 8 * sizeof ( tmp ) ) ) + 1 );

	/* Update total length (in bits) */
	total = ( ( dst \|\| ( flags & GCM_FL_IV ) ) ?
	&context->len.len.data : &context->len.len.add );
	total += ( len 8 );

	/* Process data */
	for ( ; len ; src += frag_len, len -= frag_len, block++ ) {

	/* Calculate fragment length */
	frag_len = len;
	if ( frag_len > sizeof ( tmp ) )
	frag_len = sizeof ( tmp );

	/* Update hash with input data */
	gcm_xor ( src, &context->hash, &context->hash, frag_len );

	/* Encrypt/decrypt block, if applicable */
	if ( dst ) {

	/* Increment counter */
	gcm_count ( &context->ctr, 1 );

	/* Encrypt counter */
	DBGC2 ( context, "GCM %p Y[%d]:\n", context, block );
	DBGC2_HDA ( context, 0, &context->ctr,
	sizeof ( context->ctr ) );
	cipher_encrypt ( context->raw_cipher, &context->raw_ctx,
	&context->ctr, &tmp, sizeof ( tmp ) );
	DBGC2 ( context, "GCM %p E(K,Y[%d]):\n",
	context, block );
	DBGC2_HDA ( context, 0, &tmp, sizeof ( tmp ) );

	/* Encrypt/decrypt data */
	gcm_xor ( src, &tmp, dst, frag_len );
	dst += frag_len;

	/* Update hash with encrypted data, if applicable */
	gcm_xor ( &tmp, &context->hash, &context->hash,
	( frag_len & flags ) );
	}

	/* Update hash */
	gcm_multiply_key ( &context->key, &context->hash );
	DBGC2 ( context, "GCM %p X[%d]:\n", context, block );
	DBGC2_HDA ( context, 0, &context->hash,
	sizeof ( context->hash ) );
	}
	}

	/**
	* Construct hash
	*
	* @v context Context
	* @v hash Hash to fill in
	*/
	static void gcm_hash ( struct gcm_context context, union gcm_block hash ) {

	/* Construct big-endian lengths block */
	hash->len.add = cpu_to_be64 ( context->len.len.add );
	hash->len.data = cpu_to_be64 ( context->len.len.data );
	DBGC2 ( context, "GCM %p len(A)\|\|len(C):\n", context );
	DBGC2_HDA ( context, 0, hash, sizeof ( *hash ) );

	/* Update hash */
	gcm_xor_block ( &context->hash, hash );
	gcm_multiply_key ( &context->key, hash );
	DBGC2 ( context, "GCM %p GHASH(H,A,C):\n", context );
	DBGC2_HDA ( context, 0, hash, sizeof ( *hash ) );
	}

	/**
	* Construct tag
	*
	* @v context Context
	* @v tag Tag
	*/
	void gcm_tag ( struct gcm_context context, union gcm_block tag ) {
	union gcm_block tmp;
	uint32_t offset;

	/* Construct hash */
	gcm_hash ( context, tag );

	/* Construct encrypted initial counter value */
	memcpy ( &tmp, &context->ctr, sizeof ( tmp ) );
	offset = ( ( -context->len.len.data ) / ( 8 * sizeof ( tmp ) ) );
	gcm_count ( &tmp, offset );
	cipher_encrypt ( context->raw_cipher, &context->raw_ctx, &tmp,
	&tmp, sizeof ( tmp ) );
	DBGC2 ( context, "GCM %p E(K,Y[0]):\n", context );
	DBGC2_HDA ( context, 0, &tmp, sizeof ( tmp ) );

	/* Construct tag */
	gcm_xor_block ( &tmp, tag );
	DBGC2 ( context, "GCM %p T:\n", context );
	DBGC2_HDA ( context, 0, tag, sizeof ( *tag ) );
	}

	/**
	* Set key
	*
	* @v context Context
	* @v key Key
	* @v keylen Key length
	* @v raw_cipher Underlying cipher
	* @ret rc Return status code
	*/
	int gcm_setkey ( struct gcm_context context, const void key, size_t keylen,
	struct cipher_algorithm *raw_cipher ) {
	int rc;

	/* Initialise GCM context */
	memset ( context, 0, sizeof ( *context ) );
	context->raw_cipher = raw_cipher;

	/* Set underlying block cipher key */
	if ( ( rc = cipher_setkey ( raw_cipher, context->raw_ctx, key,
	keylen ) ) != 0 )
	return rc;

	/* Construct GCM hash key */
	cipher_encrypt ( raw_cipher, context->raw_ctx, &context->ctr,
	&context->key, sizeof ( context->key ) );
	DBGC2 ( context, "GCM %p H:\n", context );
	DBGC2_HDA ( context, 0, &context->key, sizeof ( context->key ) );

	/* Reset counter */
	context->ctr.ctr.value = cpu_to_be32 ( 1 );

	/* Construct cached tables */
	gcm_cache ( &context->key );

	return 0;
	}

	/**
	* Set initialisation vector
	*
	* @v ctx Context
	* @v iv Initialisation vector
	* @v ivlen Initialisation vector length
	*/
	void gcm_setiv ( struct gcm_context context, const void iv, size_t ivlen ) {

	/* Reset non-key state */
	memset ( context, 0, gcm_offset ( key ) );
	build_assert ( gcm_offset ( key ) > gcm_offset ( hash ) );
	build_assert ( gcm_offset ( key ) > gcm_offset ( len ) );
	build_assert ( gcm_offset ( key ) > gcm_offset ( ctr ) );
	build_assert ( gcm_offset ( key ) < gcm_offset ( raw_cipher ) );
	build_assert ( gcm_offset ( key ) < gcm_offset ( raw_ctx ) );

	/* Reset counter */
	context->ctr.ctr.value = cpu_to_be32 ( 1 );

	/* Process initialisation vector */
	if ( ivlen == sizeof ( context->ctr.ctr.iv ) ) {

	/* Initialisation vector is exactly 96 bits, use it as-is */
	memcpy ( context->ctr.ctr.iv, iv, ivlen );

	} else {

	/* Calculate hash over initialisation vector */
	gcm_process ( context, iv, NULL, ivlen, GCM_FL_IV );
	gcm_hash ( context, &context->ctr );
	assert ( context->len.len.add == 0 );

	/* Reset non-key, non-counter state */
	memset ( context, 0, gcm_offset ( ctr ) );
	build_assert ( gcm_offset ( ctr ) > gcm_offset ( hash ) );
	build_assert ( gcm_offset ( ctr ) > gcm_offset ( len ) );
	build_assert ( gcm_offset ( ctr ) < gcm_offset ( key ) );
	build_assert ( gcm_offset ( ctr ) < gcm_offset ( raw_cipher ) );
	build_assert ( gcm_offset ( ctr ) < gcm_offset ( raw_ctx ) );
	}

	DBGC2 ( context, "GCM %p Y[0]:\n", context );
	DBGC2_HDA ( context, 0, &context->ctr, sizeof ( context->ctr ) );
	}

	/**
	* Encrypt data
	*
	* @v context Context
	* @v src Data to encrypt
	* @v dst Buffer for encrypted data, or NULL for additional data
	* @v len Length of data
	*/
	void gcm_encrypt ( struct gcm_context context, const void src, void *dst,
	size_t len ) {

	/* Process data */
	gcm_process ( context, src, dst, len, GCM_FL_ENCRYPT );
	}

	/**
	* Decrypt data
	*
	* @v context Context
	* @v src Data to decrypt
	* @v dst Buffer for decrypted data, or NULL for additional data
	* @v len Length of data
	*/
	void gcm_decrypt ( struct gcm_context context, const void src, void *dst,
	size_t len ) {

	/* Process data */
	gcm_process ( context, src, dst, len, 0 );
	}