unicode_text/utf8.lua

83 lines
2.4 KiB
Lua
Raw Normal View History

2023-03-17 03:39:41 +01:00
#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
-- Module table. This is a global on purpose so that other files can reach
-- it; note that on Lua 5.3+ it replaces the built-in `utf8` library table.
utf8 = {}

-- Convert a UTF-8 string into a table with codepoints.
-- Inspired by <https://lua-users.org/wiki/LuaUnicode>.
--
-- Parameters:
--   text  string containing UTF-8 encoded text (asserted to be a string)
-- Returns:
--   a sequence (1..n) with one number per decoded codepoint, in input
--   order; the empty string yields an empty table.
-- Raises:
--   "invalid UTF-8 sequence" if a lead byte is not a valid start byte
--   (a stray continuation byte 10xxxxxx or a byte above 11110111), if the
--   text ends in the middle of a sequence, or if a byte that should be a
--   continuation byte is not of the form 10xxxxxx.
--   Overlong encodings and surrogate codepoints are NOT rejected.
utf8.text_to_codepoints = function(text)
	assert(
		"string" == type(text)
	)
	local invalid_message = "invalid UTF-8 sequence"
	local result = {}
	local i = 1
	while i <= #text do
		local byte_1 = string.byte(text, i)
		-- The lead byte determines the sequence length and carries the
		-- most significant payload bits of the codepoint.
		local sequence_length, value
		if byte_1 <= 127 then
			-- 0xxxxxxx
			sequence_length, value = 1, byte_1
		elseif byte_1 <= 191 then
			-- 10xxxxxx is a continuation byte; it can not start a sequence
			error(invalid_message)
		elseif byte_1 <= 223 then
			-- 110xxxxx 10xxxxxx
			sequence_length, value = 2, byte_1 % 32
		elseif byte_1 <= 239 then
			-- 1110xxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 3, byte_1 % 16
		elseif byte_1 <= 247 then
			-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 4, byte_1 % 8
		else
			error(invalid_message)
		end
		-- Every continuation byte contributes six more payload bits.
		for offset = 1, sequence_length - 1 do
			-- string.byte returns nothing past the end of the text, so a
			-- truncated sequence shows up as nil here.
			local continuation = string.byte(text, i + offset)
			if
				nil == continuation or
				continuation < 128 or
				continuation > 191
			then
				error(invalid_message)
			end
			value = value * 64 + continuation % 64
		end
		result[#result + 1] = value
		i = i + sequence_length
	end
	return result
end
-- Self-test executed at load time: decode a sample that holds exactly one
-- character of every encoded length (1, 2, 3 and 4 bytes) and compare the
-- space-joined codepoints with the known decimal values.
local codepoints = utf8.text_to_codepoints("wð♥𐍈") -- U+0077 U+00F0 U+2665 U+10348
assert("119 240 9829 66376" == table.concat(codepoints, " "))