Files
second-movement/filesystem/unicode.c

190 lines
6.3 KiB
C

/*
* USB mass storage class driver that mimics littlefs to FAT12 file system.
*
* Copyright 2024, Hiroyuki OYAMA. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
* - Neither the name of the copyright holder nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "unicode.h"
/*
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * Only the lead byte of each sequence is examined: its high bits select a
 * width of 1 to 4 bytes and the scan jumps ahead by that width. The skipped
 * continuation bytes are not validated, and a sequence cut short by the end
 * of the string is still counted as one character.
 *
 * Returns the number of sequences found, or (size_t)-1 when a byte that
 * cannot start a sequence appears where a lead byte is expected.
 */
size_t strlen_utf8(const char *src) {
    const size_t total = strlen(src);
    size_t chars = 0;
    size_t pos = 0;

    while (pos < total) {
        const uint8_t lead = src[pos];
        size_t width;

        if ((lead & 0x80) == 0) {
            width = 1; // 0xxxxxxx
        } else if ((lead & 0xE0) == 0xC0) {
            width = 2; // 110xxxxx
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3; // 1110xxxx
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4; // 11110xxx
        } else {
            return -1; // Invalid UTF-8 byte
        }

        chars++;
        pos += width;
    }
    return chars;
}
/*
 * Widen a single-byte string into 16-bit code units, one unit per input byte.
 *
 * dist       output buffer
 * dist_size  capacity of dist, in 16-bit units
 * src        input bytes
 * src_size   maximum number of bytes to read from src
 *
 * Copying stops at the first NUL byte, after src_size bytes, or when dist is
 * full, whichever comes first. A terminating 0 unit is stored only if there is
 * still room for it after the copied units.
 *
 * Returns the number of units written, not counting the terminator.
 */
size_t ascii_to_utf16le(uint16_t *dist, size_t dist_size, const char *src, size_t src_size) {
    size_t utf16le_pos = 0;

    for (size_t i = 0; i < src_size && src[i] != '\0' && utf16le_pos < dist_size; ++i) {
        // Widen through unsigned char: plain char may be signed, in which case
        // a byte >= 0x80 would sign-extend and be stored as 0xFFxx.
        dist[utf16le_pos++] = (uint16_t)(unsigned char)src[i];
    }

    if (utf16le_pos < dist_size) {
        dist[utf16le_pos] = 0; // Null-terminate when space allows
    }
    return utf16le_pos;
}
/*
 * Convert UTF-8 to UTF-16 code units.
 *
 * dist       output buffer
 * dist_size  capacity of dist, in 16-bit units
 * src        UTF-8 input; NUL bytes are not treated as terminators and are
 *            converted like any other one-byte character
 * src_size   number of bytes to read from src
 *
 * Conversion ends when src_size bytes have been consumed or when dist is full.
 * A terminating 0 unit is stored only if there is still room for it.
 *
 * Returns the number of units written (not counting the terminator), or
 * (size_t)-1 when:
 *   - a byte cannot start a UTF-8 sequence,
 *   - a sequence is cut short by src_size or has a bad continuation byte,
 *   - the decoded value is above U+10FFFF,
 *   - a surrogate pair is needed but fewer than two units remain in dist.
 */
size_t utf8_to_utf16le(uint16_t *dist, size_t dist_size, const char *src, size_t src_size) {
    size_t dist_pos = 0;
    size_t src_pos = 0;

    while (src_pos < src_size && dist_pos < dist_size) {
        uint32_t codepoint = 0;
        size_t extra_bytes = 0;
        uint8_t byte = (uint8_t)src[src_pos];

        // The lead byte gives the sequence length and the top payload bits.
        if ((byte & 0x80) == 0) { // 1-byte UTF-8
            codepoint = byte;
        } else if ((byte & 0xE0) == 0xC0) { // 2-byte UTF-8
            codepoint = byte & 0x1F;
            extra_bytes = 1;
        } else if ((byte & 0xF0) == 0xE0) { // 3-byte UTF-8
            codepoint = byte & 0x0F;
            extra_bytes = 2;
        } else if ((byte & 0xF8) == 0xF0) { // 4-byte UTF-8
            codepoint = byte & 0x07;
            extra_bytes = 3;
        } else {
            return (size_t)-1; // Invalid UTF-8 lead byte
        }

        // Fold in six payload bits from each continuation byte.
        for (size_t j = 0; j < extra_bytes; ++j) {
            src_pos++;
            if (src_pos >= src_size) {
                return (size_t)-1; // Incomplete UTF-8 sequence
            }
            byte = (uint8_t)src[src_pos];
            if ((byte & 0xC0) != 0x80) {
                return (size_t)-1; // Invalid UTF-8 continuation byte
            }
            codepoint = (codepoint << 6) | (byte & 0x3F);
        }

        if (codepoint <= 0xFFFF) { // Basic Multilingual Plane: one unit
            // The loop condition already guarantees room for one unit.
            dist[dist_pos++] = (uint16_t)codepoint;
        } else if (codepoint <= 0x10FFFF) { // Supplementary planes: surrogate pair
            if (dist_size - dist_pos < 2) {
                return (size_t)-1; // Not enough space for surrogates
            }
            codepoint -= 0x10000;
            // Each post-increment already advances the position by one, so
            // the pair moves dist_pos forward by exactly two units.
            dist[dist_pos++] = (uint16_t)(0xD800 | (codepoint >> 10));
            dist[dist_pos++] = (uint16_t)(0xDC00 | (codepoint & 0x3FF));
        } else {
            return (size_t)-1; // Beyond U+10FFFF: not representable in UTF-16
        }
        src_pos++;
    }

    if (dist_pos < dist_size) {
        dist[dist_pos] = 0; // Null-terminate when space allows
    }
    return dist_pos;
}
/*
 * Convert UTF-16 code units to UTF-8.
 *
 * dist         output buffer
 * buffer_size  capacity of dist, in bytes
 * src          input code units
 * len          number of units available in src
 *
 * Conversion ends after len units, at the first 0xFFFF unit, or as soon as the
 * next character would not fit completely in dist. A 0x0000 unit is not a
 * terminator: it is written as a zero byte and counted in the result.
 * NOTE(review): 0xFFFF looks like a padding/end marker supplied by the
 * caller - confirm against the call sites.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary-plane code point and written as a 4-byte sequence. An
 * unpaired surrogate is written as a 3-byte sequence, as for any other value
 * in 0x0800-0xFFFE.
 *
 * A terminating NUL byte is stored only if there is still room for it.
 *
 * Returns the number of bytes written, not counting the terminator.
 */
size_t utf16le_to_utf8(char *dist, size_t buffer_size, const uint16_t *src, size_t len) {
    size_t dist_len = 0;

    for (size_t i = 0; i < len; ++i) {
        uint32_t codepoint = src[i];
        if (codepoint == 0xFFFF) {
            break;
        }

        // Recombine a surrogate pair so it becomes one 4-byte sequence rather
        // than two 3-byte sequences, which would not be valid UTF-8.
        if (codepoint >= 0xD800 && codepoint <= 0xDBFF && i + 1 < len
                && src[i + 1] >= 0xDC00 && src[i + 1] <= 0xDFFF) {
            codepoint = 0x10000
                + (((codepoint - 0xD800) << 10) | (uint32_t)(src[i + 1] - 0xDC00));
            ++i; // The low surrogate has been consumed too
        }

        size_t needed;
        if (codepoint <= 0x7F) {
            needed = 1;
        } else if (codepoint <= 0x7FF) {
            needed = 2;
        } else if (codepoint <= 0xFFFF) {
            needed = 3;
        } else {
            needed = 4;
        }
        if (dist_len + needed > buffer_size) {
            break; // Never emit a partial sequence
        }

        if (needed == 1) {
            dist[dist_len++] = (uint8_t)codepoint;
        } else if (needed == 2) {
            dist[dist_len++] = (uint8_t)(0xC0 | (codepoint >> 6));
            dist[dist_len++] = (uint8_t)(0x80 | (codepoint & 0x3F));
        } else if (needed == 3) {
            dist[dist_len++] = (uint8_t)(0xE0 | (codepoint >> 12));
            dist[dist_len++] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
            dist[dist_len++] = (uint8_t)(0x80 | (codepoint & 0x3F));
        } else {
            dist[dist_len++] = (uint8_t)(0xF0 | (codepoint >> 18));
            dist[dist_len++] = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
            dist[dist_len++] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
            dist[dist_len++] = (uint8_t)(0x80 | (codepoint & 0x3F));
        }
    }

    if (dist_len < buffer_size) {
        dist[dist_len] = '\0'; // Null-terminate when space allows
    }
    return dist_len;
}