lib: charset: utility functions for Unicode
utf8_get() - get next UTF-8 code point from buffer utf8_put() - write UTF-8 code point to buffer utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16 utf8_utf16_strncpy() - copy a utf-8 string to utf-16 utf16_get() - get next UTF-16 code point from buffer utf16_put() - write UTF-16 code point to buffer utf16_strnlen() - number of code points in a utf-16 string utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8 utf16_utf8_strncpy() - copy a utf-16 string to utf-8 Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Signed-off-by: Alexander Graf <agraf@suse.de>
This commit is contained in:
		
							parent
							
								
									1dde0d57a5
								
							
						
					
					
						commit
						d8c28232c3
					
				|  | @ -8,10 +8,140 @@ | |||
| #ifndef __CHARSET_H_ | ||||
| #define __CHARSET_H_ | ||||
| 
 | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/types.h> | ||||
| 
 | ||||
| #define MAX_UTF8_PER_UTF16 3 | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_get() - get next UTF-8 code point from buffer | ||||
|  * | ||||
|  * @src:		pointer to current byte, updated to point to next byte | ||||
|  * Return:		code point, or 0 for end of string, or -1 if no legal | ||||
|  *			code point is found. In case of an error src points to | ||||
|  *			the incorrect byte. | ||||
|  */ | ||||
| s32 utf8_get(const char **src); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_put() - write UTF-8 code point to buffer | ||||
|  * | ||||
|  * @code:		code point | ||||
|  * @dst:		pointer to destination buffer, updated to next position | ||||
|  * Return:		0 on success, -1 if the input parameters are invalid | ||||
|  */ | ||||
| int utf8_put(s32 code, char **dst); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion | ||||
|  *			  to utf-16 | ||||
|  * | ||||
|  * @src:		utf-8 string | ||||
|  * @count:		maximum number of code points to convert | ||||
|  * Return:		length in u16 words after conversion to utf-16 without | ||||
|  *			the trailing \0. If an invalid UTF-8 sequence is hit one | ||||
|  *			word will be reserved for a replacement character. | ||||
|  */ | ||||
| size_t utf8_utf16_strnlen(const char *src, size_t count); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16 | ||||
|  * | ||||
|  * @src:		utf-8 string | ||||
|  * Return:		length in u16 words after conversion to utf-16 without | ||||
|  *			the trailing \0. If an invalid UTF-8 sequence is hit one | ||||
|  *			word will be reserved for a replacement character. | ||||
|  */ | ||||
| #define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX) | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string | ||||
|  * | ||||
|  * @dst:		destination buffer | ||||
|  * @src:		source buffer | ||||
|  * @count:		maximum number of code points to copy | ||||
|  * Return:		0 on success, -1 if the input parameters are invalid | ||||
|  */ | ||||
| int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf8_utf16_strcpy() - copy utf-8 string to utf-16 string | ||||
|  * | ||||
|  * @dst:		destination buffer | ||||
|  * @src:		source buffer | ||||
|  * Return:		-1 if the input parameters are invalid | ||||
|  */ | ||||
| #define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX) | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_get() - get next UTF-16 code point from buffer | ||||
|  * | ||||
|  * @src:		pointer to current word, updated to point to next word | ||||
|  * Return:		code point, or 0 for end of string, or -1 if no legal | ||||
|  *			code point is found. In case of an error src points to | ||||
|  *			the incorrect word. | ||||
|  */ | ||||
| s32 utf16_get(const u16 **src); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_put() - write UTF-16 code point to buffer | ||||
|  * | ||||
|  * @code:		code point | ||||
|  * @dst:		pointer to destination buffer, updated to next position | ||||
|  * Return:		0 on success, -1 if the input parameters are invalid | ||||
|  */ | ||||
| int utf16_put(s32 code, u16 **dst); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_strnlen() - length of a truncated utf-16 string | ||||
|  * | ||||
|  * @src:		utf-16 string | ||||
|  * @count:		maximum number of code points to convert | ||||
|  * Return:		length in code points. If an invalid UTF-16 sequence is | ||||
|  *			hit one position will be reserved for a replacement | ||||
|  *			character. | ||||
|  */ | ||||
| size_t utf16_strnlen(const u16 *src, size_t count); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion | ||||
|  *			  to utf-8 | ||||
|  * | ||||
|  * @src:		utf-16 string | ||||
|  * @count:		maximum number of code points to convert | ||||
|  * Return:		length in bytes after conversion to utf-8 without the | ||||
|  *			trailing \0. If an invalid UTF-16 sequence is hit one | ||||
|  *			byte will be reserved for a replacement character. | ||||
|  */ | ||||
| size_t utf16_utf8_strnlen(const u16 *src, size_t count); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8 | ||||
|  * | ||||
|  * @src:		utf-16 string | ||||
|  * Return:		length in bytes after conversion to utf-8 without the | ||||
|  *			trailing \0. If an invalid UTF-16 sequence is hit one | ||||
|  *			byte will be reserved for a replacement character. | ||||
|  */ | ||||
| #define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX) | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string | ||||
|  * | ||||
|  * @dst:		destination buffer | ||||
|  * @src:		source buffer | ||||
|  * @count:		maximum number of code points to copy | ||||
|  * Return:		0 on success, -1 if the input parameters are invalid | ||||
|  */ | ||||
| int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count); | ||||
| 
 | ||||
| /**
 | ||||
|  * utf16_utf8_strcpy() - copy utf-16 string to utf-8 string | ||||
|  * | ||||
|  * @dst:		destination buffer | ||||
|  * @src:		source buffer | ||||
|  * Return:		-1 if the input parameters are invalid | ||||
|  */ | ||||
| #define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX) | ||||
| 
 | ||||
| /**
 | ||||
|  * u16_strlen - count non-zero words | ||||
|  * | ||||
|  |  | |||
							
								
								
									
										232
									
								
								lib/charset.c
								
								
								
								
							
							
						
						
									
										232
									
								
								lib/charset.c
								
								
								
								
							|  | @ -8,9 +8,239 @@ | |||
| #include <charset.h> | ||||
| #include <malloc.h> | ||||
| 
 | ||||
/**
 * utf8_get() - get next UTF-8 code point from buffer
 *
 * Only well-formed UTF-8 is accepted: overlong encodings, encoded
 * surrogates (U+D800 - U+DFFF) and values above U+10FFFF are rejected.
 *
 * @src:	pointer to current byte, updated to point to next byte
 * Return:	code point, or 0 for end of string, or -1 if no legal code
 *		point is found. An invalid lead byte is consumed so that
 *		callers iterating over a string always make progress. If a
 *		continuation byte is missing or invalid, src points to that
 *		byte. If a complete sequence encodes an illegal value, the
 *		whole sequence is consumed.
 */
s32 utf8_get(const char **src)
{
	const unsigned char *pos;
	unsigned char lead;
	s32 code;
	s32 min_code;
	int trailing;

	if (!src || !*src)
		return -1;
	pos = (const unsigned char *)*src;
	lead = *pos;
	if (!lead)
		return 0;

	/* The first byte is always consumed, even if it is not legal */
	++pos;
	*src = (const char *)pos;
	if (lead < 0x80)
		return lead;

	/*
	 * We do not expect a continuation byte (0x80 - 0xbf).
	 * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
	 * here.
	 * The highest code point is 0x10ffff which is coded as
	 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
	 */
	if (lead < 0xc2 || lead > 0xf4)
		return -1;

	if (lead < 0xe0) {
		/* 0xc2 - 0xdf: two byte sequence */
		trailing = 1;
		code = lead & 0x1f;
		min_code = 0x80;
	} else if (lead < 0xf0) {
		/* 0xe0 - 0xef: three byte sequence */
		trailing = 2;
		code = lead & 0x0f;
		min_code = 0x800;
	} else {
		/* 0xf0 - 0xf4: four byte sequence */
		trailing = 3;
		code = lead & 0x07;
		min_code = 0x10000;
	}

	for (; trailing; --trailing) {
		unsigned char c = *pos;

		/* This also catches a string ending inside a sequence */
		if (c < 0x80 || c > 0xbf) {
			*src = (const char *)pos;
			return -1;
		}
		code = (code << 6) | (c & 0x3f);
		++pos;
	}
	*src = (const char *)pos;

	/* Reject overlong encodings, surrogates and out of range values */
	if (code < min_code || (code >= 0xD800 && code <= 0xDFFF) ||
	    code > 0x10FFFF)
		return -1;
	return code;
}
| 
 | ||||
/**
 * utf8_put() - write UTF-8 code point to buffer
 *
 * Between one and four bytes are written. No terminating \0 is appended.
 *
 * @code:	code point
 * @dst:	pointer to destination buffer, updated to next position
 * Return:	0 on success, -1 if the input parameters are invalid, i.e.
 *		dst or *dst is NULL, or code is negative, a surrogate
 *		(U+D800 - U+DFFF) or above U+10FFFF. Nothing is written and
 *		*dst is not changed in the error case.
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value is no code point and must not be truncated */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code > 0x10FFFF)
		return -1;

	out = *dst;
	if (code < 0x80) {
		*out++ = (char)code;
	} else if (code < 0x800) {
		*out++ = (char)(0xC0 | (code >> 6));
		*out++ = (char)(0x80 | (code & 0x3F));
	} else if (code < 0x10000) {
		*out++ = (char)(0xE0 | (code >> 12));
		*out++ = (char)(0x80 | ((code >> 6) & 0x3F));
		*out++ = (char)(0x80 | (code & 0x3F));
	} else {
		*out++ = (char)(0xF0 | (code >> 18));
		*out++ = (char)(0x80 | ((code >> 12) & 0x3F));
		*out++ = (char)(0x80 | ((code >> 6) & 0x3F));
		*out++ = (char)(0x80 | (code & 0x3F));
	}
	*dst = out;
	return 0;
}
| 
 | ||||
/**
 * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
 *			  to utf-16
 *
 * @src:	utf-8 string
 * @count:	maximum number of code points to convert
 * Return:	number of u16 words needed for the utf-16 string without the
 *		trailing \0. An invalid UTF-8 sequence is counted as one word
 *		which is reserved for a replacement character.
 */
size_t utf8_utf16_strnlen(const char *src, size_t count)
{
	size_t words = 0;

	while (*src && count) {
		s32 cp = utf8_get(&src);

		if (cp == 0)
			break;
		/*
		 * Only code points outside the basic multilingual plane need
		 * a surrogate pair. Errors (negative values) take one word.
		 */
		words += (cp >= 0x10000) ? 2 : 1;
		--count;
	}
	return words;
}
| 
 | ||||
/**
 * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
 *
 * Invalid UTF-8 sequences are replaced by '?'. The destination is always
 * terminated by a zero word; *dst is left pointing at that terminator.
 *
 * @dst:	destination buffer
 * @src:	source buffer
 * @count:	maximum number of code points to copy
 * Return:	0 on success, -1 if the input parameters are invalid
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf8_get(&src);

		/* Substitute a replacement character for illegal input */
		utf16_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
| 
 | ||||
/**
 * utf16_get() - get next UTF-16 code point from buffer
 *
 * @src:	pointer to current word, updated to point to next word
 * Return:	code point, or 0 for end of string, or -1 if no legal code
 *		point is found. A lone low surrogate is consumed. A high
 *		surrogate followed by the end of the string leaves src
 *		pointing at the terminating zero word. A high surrogate
 *		followed by any other word that is not a low surrogate
 *		consumes both words.
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not start a sequence */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* High surrogate: a low surrogate has to follow */
		if (!**src)
			return -1;
		code2 = **src;
		++*src;
		/* 0xDC00 and 0xDFFF themselves are valid low surrogates */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
	}
	return code;
}
| 
 | ||||
/**
 * utf16_put() - write UTF-16 code point to buffer
 *
 * One word is written for code points below U+10000, otherwise a surrogate
 * pair is written. No terminating zero word is appended.
 *
 * @code:	code point
 * @dst:	pointer to destination buffer, updated to next position
 * Return:	0 on success, -1 if the input parameters are invalid, i.e.
 *		dst or *dst is NULL, or code is negative, a surrogate
 *		(U+D800 - U+DFFF) or above U+10FFFF. Nothing is written and
 *		*dst is not changed in the error case.
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value is no code point and must not be truncated */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code > 0x10FFFF)
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = (u16)code;
	} else {
		/* Split the 20 bit offset into high and low surrogate */
		code -= 0x10000;
		*out++ = (u16)(0xD800 | (code >> 10));
		*out++ = (u16)(0xDC00 | (code & 0x3ff));
	}
	*dst = out;
	return 0;
}
| 
 | ||||
/**
 * utf16_strnlen() - length of a truncated utf-16 string
 *
 * @src:	utf-16 string
 * @count:	maximum number of code points to convert
 * Return:	length in code points. An illegal UTF-16 sequence is counted
 *		as one position which is reserved for a replacement
 *		character.
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t points = 0;

	while (*src && count) {
		/* Errors (-1) are counted too, only the end of string stops */
		if (utf16_get(&src) == 0)
			break;
		++points;
		--count;
	}
	return points;
}
| 
 | ||||
/**
 * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
 *			  to utf-8
 *
 * @src:	utf-16 string
 * @count:	maximum number of code points to convert
 * Return:	length in bytes after conversion to utf-8 without the
 *		trailing \0. An illegal UTF-16 sequence is counted as one
 *		byte which is reserved for a replacement character.
 */
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
{
	size_t bytes = 0;

	while (*src && count) {
		s32 cp = utf16_get(&src);

		if (cp == 0)
			break;
		/* Negative values (errors) fall into the one byte case */
		if (cp < 0x80)
			bytes += 1;
		else if (cp < 0x800)
			bytes += 2;
		else if (cp < 0x10000)
			bytes += 3;
		else
			bytes += 4;
		--count;
	}
	return bytes;
}
| 
 | ||||
/**
 * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
 *
 * Illegal UTF-16 sequences are replaced by '?'. The destination is always
 * terminated by a zero byte; *dst is left pointing at that terminator.
 *
 * @dst:	destination buffer
 * @src:	source buffer
 * @count:	maximum number of code points to copy
 * Return:	0 on success, -1 if the input parameters are invalid
 */
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf16_get(&src);

		/* Substitute a replacement character for illegal input */
		utf8_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
| 
 | ||||
| 
 | ||||
| size_t u16_strlen(const u16 *in) | ||||
| { | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue