blob: 8580bc598b3391f80854b0b0971470d62638b26e [file] [log] [blame]
Markus Armbrustercb2744e2013-04-11 18:07:18 +02001/*
2 * Dealing with Unicode
3 *
4 * Copyright (C) 2013 Red Hat, Inc.
5 *
6 * Authors:
7 * Markus Armbruster <armbru@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
11 */
12
Peter Maydellaafd7582016-01-29 17:49:55 +000013#include "qemu/osdep.h"
Veronia Bahaaf348b6d2016-03-20 19:16:19 +020014#include "qemu/unicode.h"
Markus Armbrustercb2744e2013-04-11 18:07:18 +020015
/*
 * Tell whether @codepoint is acceptable: it must lie within
 * U+0000..U+10FFFF and be neither a surrogate nor a noncharacter.
 * Negative values are rejected as out of range.
 */
static bool is_valid_codepoint(int codepoint)
{
    bool out_of_range = codepoint < 0 || codepoint > 0x10FFFF;
    bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    /*
     * Noncharacters: the block U+FDD0..U+FDEF, plus every codepoint
     * whose low 16 bits are 0xFFFE or 0xFFFF.
     */
    bool noncharacter = (codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
        || (codepoint & 0xFFFE) == 0xFFFE;

    return !(out_of_range || surrogate || noncharacter);
}
30
/**
 * mod_utf8_codepoint:
 * @s: string encoded in modified UTF-8
 * @n: upper bound on the number of bytes that may be read from @s
 *     (only matters when it is less than 6)
 * @end: receives a pointer to the end of the examined sequence
 *
 * Decode the modified UTF-8 sequence at the start of @s.  Modified
 * UTF-8 is UTF-8 with a single difference: U+0000 is encoded as
 * "\xC0\x80".
 *
 * When @n is zero or the first byte is zero, the sequence is invalid
 * and @end is set to @s.
 *
 * When the first byte is an impossible byte (0xFE or 0xFF) or a
 * continuation byte, the sequence is invalid and @end is set to
 * @s + 1.
 *
 * Otherwise the first byte announces how many continuation bytes
 * must follow.  If fewer are present, the sequence is invalid and
 * @end is set to @s + 1 + the number of continuation bytes actually
 * found.  If all are present, the sequence is well-formed and @end is
 * set to @s + 1 + the announced number of continuation bytes.
 *
 * A well-formed sequence is still invalid when it encodes a codepoint
 * outside U+0000..U+10FFFF, a noncharacter, a surrogate, or when it
 * is overlong.  The overlong sequence "\xC0\x80" is the one exception
 * and is valid.
 *
 * Returns: the decoded Unicode codepoint if the sequence is valid,
 * -1 otherwise.
 */
int mod_utf8_codepoint(const char *s, size_t n, char **end)
{
    /* Smallest codepoint that needs 2, 3, 4, 5, 6 bytes respectively */
    static const int shortest[5] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char *cur = (const unsigned char *)s;
    unsigned lead, marker, seq_len, idx, cont;
    bool overlong;
    int result = -1;

    if (n == 0 || *cur == 0) {
        /* nothing to decode; do not consume anything */
        *end = (char *)s;
        return -1;
    }

    lead = *cur++;

    if (lead < 0x80) {
        /* plain one byte sequence */
        result = lead;
        goto done;
    }
    if (lead >= 0xFE || !(lead & 0x40)) {
        /* impossible byte 0xFE/0xFF, or a stray continuation byte */
        goto done;
    }

    /* The number of leading one bits is the total sequence length */
    seq_len = 0;
    marker = 0x80;
    while (lead & marker) {
        seq_len++;
        marker >>= 1;
    }
    assert(seq_len > 1 && seq_len < 7);

    /* Payload bits of the lead byte sit below the first zero bit */
    result = lead & (marker - 1);

    for (idx = 1; idx < seq_len; idx++) {
        /* Treat bytes beyond the @n limit like a terminating zero */
        cont = idx < n ? *cur : 0;
        if ((cont & 0xC0) != 0x80) {
            /* sequence is cut short; @end stops at the offending byte */
            result = -1;
            goto done;
        }
        cur++;
        result = (result << 6) | (cont & 0x3F);
    }

    /* Overlong means a shorter sequence could encode the same value */
    overlong = result < shortest[seq_len - 2];
    if (!is_valid_codepoint(result)
        || (overlong && (result != 0 || seq_len != 2))) {
        /* only "\xC0\x80" for U+0000 is a tolerated overlong form */
        result = -1;
    }

done:
    *end = (char *)cur;
    return result;
}
Markus Armbrustere59f39d2018-08-23 18:39:49 +0200112
/**
 * mod_utf8_encode:
 * @buf: destination buffer
 * @bufsz: size of @buf; must be at least 5
 * @codepoint: Unicode codepoint to encode
 *
 * Encode @codepoint in modified UTF-8 and store the result in @buf,
 * followed by a terminating zero byte.  U+0000 is stored as the two
 * byte sequence "\xC0\x80".
 *
 * Returns: the length of the encoded sequence, not counting the
 * terminating zero byte, or -1 when @codepoint is invalid.
 */
ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
{
    ssize_t len, pos;
    int lead_bits;

    assert(bufsz >= 5);

    if (!is_valid_codepoint(codepoint)) {
        return -1;
    }

    /* Select the sequence length and the marker bits of its lead byte */
    if (codepoint > 0 && codepoint <= 0x7F) {
        len = 1;
        lead_bits = 0;
    } else if (codepoint <= 0x7FF) {
        /* also taken for U+0000, which yields "\xC0\x80" */
        len = 2;
        lead_bits = 0xC0;
    } else if (codepoint <= 0xFFFF) {
        len = 3;
        lead_bits = 0xE0;
    } else {
        len = 4;
        lead_bits = 0xF0;
    }

    /* Emit continuation bytes back to front, six payload bits each */
    for (pos = len - 1; pos > 0; pos--) {
        buf[pos] = 0x80 | (codepoint & 0x3F);
        codepoint >>= 6;
    }

    /* What is left of the codepoint fits into the lead byte */
    buf[0] = lead_bits | codepoint;
    buf[len] = 0;
    return len;
}