blob: 4ee01baf9d94c8e86176b1b217b31ad9400845b5 [file] [log] [blame]
/*
* Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* You can also choose to distribute this program under the terms of
* the Unmodified Binary Distribution Licence (as given in the file
* COPYING.UBDL), provided that you have satisfied its requirements.
*/
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#include <stdint.h>
#include <assert.h>
#include <ipxe/utf8.h>
/** @file
*
* UTF-8 Unicode encoding
*
*/
/**
 * Accumulate Unicode character from UTF-8 byte sequence
 *
 * @v utf8		UTF-8 accumulator
 * @v byte		UTF-8 byte
 * @ret character	Unicode character, or 0 if incomplete
 *
 * Bytes are fed in one at a time.  The return value is:
 *
 *  - the byte itself, if it is an ASCII byte (note that an ASCII NUL
 *    is therefore indistinguishable from "incomplete");
 *  - zero, while a multi-byte sequence is still awaiting further
 *    continuation bytes;
 *  - the decoded character, once the final continuation byte of a
 *    legal sequence has been received;
 *  - UTF8_INVALID, for an unexpected continuation byte, for an
 *    initial byte announcing more than UTF8_MAX_LEN bytes, or for a
 *    completed sequence that is overlong, that encodes a UTF-16
 *    surrogate (U+D800-U+DFFF), or that exceeds U+10FFFF.
 *
 * A non-continuation byte arriving in the middle of a sequence
 * silently abandons the partial character and is then processed as
 * the start of a new character.
 */
unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
	/* Smallest character that legitimately requires N
	 * continuation bytes, indexed by ( N - 1 ).  The table is
	 * never modified, so it is declared const.
	 */
	static const unsigned int min[] = {
		UTF8_MIN_TWO,
		UTF8_MIN_THREE,
		UTF8_MIN_FOUR,
	};
	/* Limits placed on decoded values by RFC 3629 */
	const unsigned int surrogate_first = 0xd800;
	const unsigned int surrogate_last = 0xdfff;
	const unsigned int unicode_last = 0x10ffff;
	unsigned int shift;
	unsigned int len;
	uint8_t tmp;

	/* Handle continuation bytes */
	if ( UTF8_IS_CONTINUATION ( byte ) ) {

		/* Fail if this is an unexpected continuation byte */
		if ( utf8->remaining == 0 ) {
			DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
			return UTF8_INVALID;
		}

		/* Apply continuation byte */
		utf8->character <<= UTF8_CONTINUATION_BITS;
		utf8->character |= ( byte & UTF8_CONTINUATION_MASK );

		/* Return 0 if more continuation bytes are expected */
		if ( --utf8->remaining != 0 )
			return 0;

		/* Fail if sequence is illegal: an overlong encoding,
		 * an encoded UTF-16 surrogate, or a value beyond the
		 * end of the Unicode code space.
		 */
		if ( ( utf8->character < utf8->min ) ||
		     ( ( utf8->character >= surrogate_first ) &&
		       ( utf8->character <= surrogate_last ) ) ||
		     ( utf8->character > unicode_last ) ) {
			DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
			       utf8->character );
			return UTF8_INVALID;
		}

		/* Sanity check: a character that passed the minimum
		 * value check cannot be zero.
		 */
		assert ( utf8->character != 0 );

		/* Return completed character */
		DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
			utf8, utf8->character );
		return utf8->character;
	}

	/* Reset state and report failure if this is an unexpected
	 * non-continuation byte.  Do not return UTF8_INVALID since
	 * doing so could cause us to drop a valid ASCII character.
	 */
	if ( utf8->remaining != 0 ) {
		/* Number of bits that the missing continuation bytes
		 * would have contributed (used for diagnostics only).
		 */
		shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
		DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
		       utf8, byte, ( utf8->character << shift ),
		       ( ( 1 << shift ) - 1 ) );
		utf8->remaining = 0;
	}

	/* Handle initial bytes */
	if ( ! UTF8_IS_ASCII ( byte ) ) {

		/* Sanity check */
		assert ( utf8->remaining == 0 );

		/* Count total number of bytes in sequence: this is
		 * the number of leading one bits in the initial byte.
		 * Shifting within a uint8_t discards those bits as we
		 * go, leaving only the payload bits (left-aligned).
		 */
		tmp = byte;
		len = 0;
		while ( tmp & UTF8_HIGH_BIT ) {
			tmp <<= 1;
			len++;
		}

		/* Check for illegal length */
		if ( len > UTF8_MAX_LEN ) {
			DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
			       utf8, byte, len );
			return UTF8_INVALID;
		}

		/* Store initial bits of character (shift the payload
		 * bits back down to their right-aligned position).
		 */
		utf8->character = ( tmp >> len );

		/* Store number of bytes remaining */
		len--;
		utf8->remaining = len;
		assert ( utf8->remaining > 0 );

		/* Store minimum legal value */
		utf8->min = min[ len - 1 ];
		assert ( utf8->min > 0 );

		/* Await continuation bytes */
		return 0;
	}

	/* Handle ASCII bytes */
	return byte;
}