83 lines
2.4 KiB
Lua
83 lines
2.4 KiB
Lua
|
#!/usr/bin/env lua5.1
|
||
|
|
||
|
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
|
||
|
|
||
|
-- Module table for the UTF-8 helpers defined in this file.
-- NOTE: this is a global on purpose (other code looks it up by name).
-- On Lua 5.3+ it replaces the built-in utf8 library; the shebang of
-- this script targets Lua 5.1, which has no such library.
utf8 = {}

-- Convert a UTF-8 encoded string into a table with codepoints.
-- Inspired by <https://lua-users.org/wiki/LuaUnicode>
--
-- Parameters:
--   text  a string holding UTF-8 encoded text
-- Returns:
--   a sequence (1-based table) with one numeric codepoint per encoded
--   character, in input order; an empty table for the empty string
-- Errors:
--   * assertion failure if text is not a string
--   * "invalid UTF-8 sequence" if a lead byte is not a valid lead
--     byte, if the input ends in the middle of a sequence, or if a
--     continuation byte does not have the form 10xxxxxx
-- Not checked: overlong encodings, surrogate codepoints and values
-- beyond U+10FFFF are decoded arithmetically and returned as they are.
utf8.text_to_codepoints = function(text)
	assert(
		"string" == type(text)
	)
	local result = {}
	local text_length = #text
	local i = 1
	while i <= text_length do
		local byte_1 = string.byte(text, i)
		-- The lead byte determines the sequence length and contributes
		-- the most significant payload bits of the codepoint.
		local sequence_length, value
		if byte_1 <= 127 then
			-- 0xxxxxxx
			sequence_length, value = 1, byte_1
		elseif byte_1 <= 191 then
			-- 10xxxxxx is a continuation byte, it can not start a sequence
			error("invalid UTF-8 sequence")
		elseif byte_1 <= 223 then
			-- 110xxxxx 10xxxxxx
			sequence_length, value = 2, byte_1 % 32
		elseif byte_1 <= 239 then
			-- 1110xxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 3, byte_1 % 16
		elseif byte_1 <= 247 then
			-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			sequence_length, value = 4, byte_1 % 8
		else
			error("invalid UTF-8 sequence")
		end
		-- Every continuation byte appends six payload bits.
		for offset = 1, sequence_length - 1 do
			local byte_n = string.byte(text, i + offset)
			-- string.byte() returns nothing past the end of the string,
			-- so nil here means that the sequence was cut short.
			if nil == byte_n or byte_n < 128 or byte_n > 191 then
				error("invalid UTF-8 sequence")
			end
			value = value * 64 + (byte_n % 64)
		end
		result[#result + 1] = value
		i = i + sequence_length
	end
	return result
end
|
||
|
|
||
|
-- Self-test, executed whenever this file is run or loaded: decode a
-- string holding one character for each UTF-8 sequence length (one to
-- four bytes) and compare the result with the expected codepoints
-- U+0077 U+00F0 U+2665 U+10348, written in decimal.
local codepoints = utf8.text_to_codepoints("wð♥𐍈")
assert("119 240 9829 66376" == table.concat(codepoints, " "))
|