unicode_text/utf8.lua

145 lines
4.2 KiB
Lua
Raw Permalink Normal View History

2023-03-17 03:39:41 +01:00
#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
-- Module table holding the UTF-8 helpers defined in this file.
-- NOTE: this is assigned as a global (no `local`); on Lua 5.3+ it
-- replaces the built-in `utf8` library table.
utf8 = {}

-- Convert a table of codepoints into an UTF-8 encoded string.
-- inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>
--
-- @param codepoints  array-like table of integer codepoints; it is
--                    read with ipairs, so reading stops at the first
--                    nil element
-- @return            string with the UTF-8 encodings of all codepoints
--                    concatenated in order ("" for an empty table)
--
-- Raises an error if `codepoints` is not a table, or if any element is
-- not an integer in the range 1..1114111 (U+0001..U+10FFFF). Note that
-- 0 is rejected by the range check and that surrogate codepoints
-- (U+D800..U+DFFF) are not rejected. Every element is validated before
-- anything is encoded.
utf8.codepoints_to_text = function(codepoints)
	assert(
		"table" == type(codepoints)
	)
	for _, codepoint in ipairs(codepoints) do
		if (
			0 >= codepoint or
			1114111 < codepoint or
			math.floor(codepoint) ~= codepoint
		) then
			error(
				string.format(
					"invalid codepoint: %s",
					codepoint
				)
			)
		end
	end
	-- One encoded string per codepoint, joined once at the end.
	local encoded = {}
	local function append(...)
		encoded[#encoded + 1] = string.char(...)
	end
	for _, codepoint in ipairs(codepoints) do
		if codepoint <= 0x7F then
			-- one byte encoding: 0xxxxxxx
			append(codepoint)
		elseif codepoint <= 0x7FF then
			-- two bytes encoding: 110xxxxx 10xxxxxx
			-- The upper bound is 2047: two bytes carry 11 payload bits,
			-- so U+0800 must already use the three byte form.
			append(
				math.floor(codepoint / 64) + 192,
				codepoint % 64 + 128
			)
		elseif codepoint <= 0xFFFF then
			-- three bytes encoding: 1110xxxx 10xxxxxx 10xxxxxx
			append(
				math.floor(codepoint / 4096) + 224,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		else
			-- four bytes encoding: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			-- (the validation above guarantees codepoint <= 1114111)
			append(
				math.floor(codepoint / 262144) + 240,
				math.floor(codepoint / 4096) % 64 + 128,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		end
	end
	return table.concat(encoded)
end
-- Self-test executed when the file is loaded: the sample holds one
-- codepoint for each encoded length (one, two, three and four bytes).
local text = utf8.codepoints_to_text({
	119,   -- U+0077
	240,   -- U+00F0
	9829,  -- U+2665
	66376, -- U+10348
})
assert(text == "wð♥𐍈")
2023-03-17 03:39:41 +01:00
-- Convert an UTF-8 encoded string into a table of codepoints.
-- inspired by <https://lua-users.org/wiki/LuaUnicode>
--
-- @param text  string containing UTF-8 encoded text
-- @return      array-like table with one integer codepoint per decoded
--              sequence, in the order they appear in `text` (an empty
--              table for "")
--
-- Raises an error if `text` is not a string. Raises
-- "invalid UTF-8 sequence" if a byte in lead position can not start a
-- sequence, if a sequence is cut off by the end of the string, or if a
-- trailing byte is not a continuation byte (10xxxxxx). Overlong
-- encodings, surrogates and values above U+10FFFF are not rejected.
utf8.text_to_codepoints = function(text)
	assert(
		"string" == type(text)
	)
	local result = {}
	local i = 1
	while i <= #text do
		local byte_1 = string.byte(text, i)
		-- `value` starts out as the payload bits of the lead byte.
		local sequence_length, value
		if byte_1 <= 127 then
			-- 0xxxxxxx
			sequence_length, value = 1, byte_1
		elseif byte_1 <= 191 then
			-- 10xxxxxx is a continuation byte, it can not start a sequence
			error("invalid UTF-8 sequence")
		elseif byte_1 <= 223 then
			-- 110xxxxx 10xxxxxx
			sequence_length, value = 2, byte_1 % 32
		elseif byte_1 <= 239 then
			-- 1110xxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 3, byte_1 % 16
		elseif byte_1 <= 247 then
			-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 4, byte_1 % 8
		else
			error("invalid UTF-8 sequence")
		end
		-- Shift in six payload bits from every continuation byte.
		for offset = 1, sequence_length - 1 do
			local byte_n = string.byte(text, i + offset)
			-- string.byte returns nothing past the end of the string,
			-- which means the sequence is truncated.
			if nil == byte_n or byte_n < 128 or byte_n > 191 then
				error("invalid UTF-8 sequence")
			end
			value = value * 64 + byte_n % 64
		end
		result[#result + 1] = value
		i = i + sequence_length
	end
	return result
end
-- Self-test executed when the file is loaded: the sample text holds
-- one character for each encoded length (one to four bytes).
local codepoints = utf8.text_to_codepoints("wð♥𐍈")
local expected = "119 240 9829 66376" -- U+0077 U+00F0 U+2665 U+10348
assert(expected == table.concat(codepoints, " "))