Compare commits

...

10 Commits

Author SHA1 Message Date
Nils Dagsson Moskopp 78a13f7490
+ Modify test for bidi rule W2 so it passes 2023-09-02 15:20:42 +02:00
Nils Dagsson Moskopp d7ebaec797 * Refactor tests for bidi rules W2, W3, W4 2023-03-26 21:29:32 +02:00
Nils Dagsson Moskopp 732b063dcd + Add tests for bidi rule W4 2023-03-26 20:46:38 +02:00
Nils Dagsson Moskopp fb971deccf + Add tests for bidi rule W3 2023-03-26 20:42:55 +02:00
Nils Dagsson Moskopp 58bc44bc7c + Add tests for bidi rule W2 2023-03-26 20:41:52 +02:00
Nils Dagsson Moskopp 4d8bea08fd * Refactor bidi.lua (resolving weak types) 2023-03-22 18:22:17 +01:00
Nils Dagsson Moskopp 82d04cee6f + Add more example texts 2023-03-22 03:44:11 +01:00
Nils Dagsson Moskopp 792276cbb5 * Reorder example text 2023-03-22 03:15:33 +01:00
Nils Dagsson Moskopp 759a7d008f + Add rudimentary bidirectional rendering support 2023-03-22 02:59:44 +01:00
Nils Dagsson Moskopp 9b984e0d2f * Refactor newline handling to handle CR, NEL, PSEP 2023-03-21 17:24:44 +01:00
5 changed files with 804 additions and 28 deletions

View File

@ -223,10 +223,15 @@ Hint 1: <https://unifoundry.com/unifont/index.html>
Hint 2: <https://trevorldavis.com/R/hexfont/>
Why is Arabic / Hebrew / Urdu etc. text rendered wrong, i.e. left to right?
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Why is Arabic / Hebrew / Urdu etc. text rendered somewhat wrong?
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
I did not implement the Unicode Bidirectional Algorithm. Patches welcome.
I did not implement the entire `Unicode Bidirectional Algorithm`_.
If you are able to read a right-to-left language, please help …
.. _`Unicode Bidirectional Algorithm`:
https://www.unicode.org/reports/tr9/
Why is the generated pixels table upside down?
++++++++++++++++++++++++++++++++++++++++++++++

644
bidi.lua Normal file
View File

@ -0,0 +1,644 @@
#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
dofile("unicodedata.lua")
bidi = {}
-- Rule P2: find the paragraph direction.
-- Returns the bidi class ("L", "R" or "AL") of the first codepoint
-- whose unicodedata entry has one of those classes. Returns nothing
-- if no such codepoint exists; the caller chooses the default.
-- See <https://www.unicode.org/reports/tr9/#P2>
local get_paragraph_direction = function(codepoints)
  local is_strong = {
    L = true,  -- left-to-right
    R = true,  -- right-to-left
    AL = true, -- right-to-left (arabic)
  }
  for _, codepoint in ipairs(codepoints) do
    -- codepoints without an entry in unicodedata are skipped
    local properties = unicodedata[codepoint]
    local bidi_class = properties and properties.bidi_class
    if is_strong[bidi_class] then
      return bidi_class
    end
  end
end
-- Reorder a paragraph from logical order to visual (display) order,
-- following the Unicode Bidirectional Algorithm as far as it is
-- implemented here.
-- See <https://www.unicode.org/reports/tr9/>
--
-- codepoints: sequence of codepoints in logical order
-- returns: a new sequence with the same codepoints in visual order
bidi.get_visual_reordering = function(codepoints)
  -- rule P2
  local paragraph_direction =
    get_paragraph_direction(codepoints) or "L"
  -- rule P3
  local paragraph_embedding_level = 0
  if (
    "R" == paragraph_direction or
    "AL" == paragraph_direction
  ) then
    paragraph_embedding_level = 1
  end
  -- The embedding direction (which also serves as sos here) is only
  -- ever L or R: L for an even level, R for an odd level. Passing
  -- the paragraph direction instead would leak "AL" into the rules.
  local embedding_direction = "L"
  if 1 == paragraph_embedding_level % 2 then
    embedding_direction = "R"
  end
  -- FIXME: Rule X1 to X10 are not implemented yet. This basically
  -- means that explicit levels or display directions are ignored.
  local run = {}
  for i, codepoint in ipairs(codepoints) do
    run[i] = {}
    run[i].codepoint = codepoint
    -- codepoints missing from unicodedata take the paragraph direction
    run[i].bidi_class =
      unicodedata[codepoint] and
      unicodedata[codepoint].bidi_class or paragraph_direction
    -- Without explicit embeddings every character starts out at the
    -- paragraph embedding level; rules I1 and I2 raise it from there.
    run[i].embedding_level = paragraph_embedding_level
  end
  -- Hack: This code is almost certainly non-conforming …
  -- but it seems to “kinda” work. Someone should fix it!
  run = bidi.resolve_weak_types(run, embedding_direction)
  run = bidi.resolve_ni_types(run, embedding_direction)
  run = bidi.resolve_implicit_types(run, paragraph_embedding_level)
  run = bidi.reorder_resolved_levels(run, paragraph_embedding_level)
  -- local, so that no global is created as a side effect of rendering
  local codepoints_reordered = {}
  for i, element in ipairs(run) do
    codepoints_reordered[i] = element.codepoint
  end
  return codepoints_reordered
end
-- Rule W1: resolve nonspacing marks (NSM).
--
-- run: sequence of elements with a bidi_class field, changed in place
-- sos: type assigned to the virtual position before the run
-- returns: the same run table
bidi.W1 = function(run, sos)
  -- Examine each nonspacing mark (NSM) in the isolating run
  -- sequence, and change the type of the NSM to Other Neutral
  -- if the previous character is an isolate initiator or PDI,
  -- and to the type of the previous character otherwise. If
  -- the NSM is at the start of the isolating run sequence, it
  -- will get the type of sos.
  local isolate_initiator_or_pdi = {
    LRI = true,
    RLI = true,
    FSI = true,
    PDI = true,
  }
  for i = 1, #run do
    if "NSM" == run[i].bidi_class then
      if 1 == i then
        run[i].bidi_class = sos
      elseif isolate_initiator_or_pdi[run[i-1].bidi_class] then
        run[i].bidi_class = "ON"
      else
        -- run[i-1] was already resolved, so a chain of NSMs
        -- inherits the type of the character before the chain
        run[i].bidi_class = run[i-1].bidi_class
      end
    end
  end
  return run
end
-- Rule W2: a European number (EN) whose nearest preceding strong
-- type is an Arabic letter (AL) becomes an Arabic number (AN).
--
-- run: sequence of elements with a bidi_class field, changed in place
-- sos: the text ordering type assigned to the virtual position
--      before an isolating run sequence
-- returns: the same run table
local W2 = function(run, sos)
  assert(
    "AL" == sos or -- FIXME: find the actual bug & remove this line
    "L" == sos or
    "R" == sos
  )
  for i = 1, #run do
    if "EN" == run[i].bidi_class then
      -- Search backward from each instance of a European number
      -- until the first strong type (R, L, AL, or sos) is found.
      -- The loop does not execute for i = 1, so a number at the
      -- very start of the run never reads run[0].
      local previous_strong_bidi_class = nil
      for j = i - 1, 1, -1 do
        local previous_bidi_class = run[j].bidi_class
        if (
          "L" == previous_bidi_class or -- left-to-right
          "R" == previous_bidi_class or -- right-to-left
          "AL" == previous_bidi_class -- right-to-left (arabic)
        ) then
          previous_strong_bidi_class = previous_bidi_class
          break
        end
      end
      -- If an AL is found, change the type of the European number
      -- to Arabic number. Reaching the start of the run (sos) leaves
      -- the number unchanged.
      if "AL" == previous_strong_bidi_class then
        run[i].bidi_class = "AN"
      end
    end
  end
  return run
end
-- Render a table as text for debugging, one "key: value" entry per
-- line. Nested tables are rendered recursively, indented by one more
-- space per nesting level. Entries appear in pairs() order, which is
-- unspecified for non-sequence keys.
--
-- table_: the table to render
-- indentation: number of leading spaces for each entry (default 0)
-- returns: a string in which every entry starts with a newline
dump_table = function(table_, indentation)
  assert( "table" == type(table_) )
  indentation = indentation or 0
  assert( "number" == type(indentation) )
  local padding = (" "):rep(indentation)
  local result = {}
  for key, value in pairs(table_) do
    local rendered_value
    if "table" == type(value) then
      rendered_value = dump_table(value, indentation + 1)
    else
      rendered_value = tostring(value)
    end
    -- tostring() so that keys which cannot be concatenated
    -- (booleans, tables, functions) do not raise an error
    result[#result + 1] =
      "\n" .. padding .. tostring(key) .. ": " .. rendered_value
  end
  return table.concat(result)
end
-- Run one bidi rule on a test input and compare the resulting bidi
-- classes with the expected ones. Raises an error consisting of the
-- description followed by a dump of the actual output if they differ.
--
-- description: text identifying the test in the error message
-- rule: function(run, ...) returning the resolved run
-- test_input: sequence of elements with a bidi_class field
-- expected_output: sequence of elements with the expected bidi_class
-- ...: extra arguments passed on to the rule (e.g. sos)
local test_rule = function(
  description,
  rule,
  test_input,
  expected_output,
  ...
)
  assert( "string" == type(description) )
  assert( "function" == type(rule) )
  assert( "table" == type(test_input) )
  assert( "table" == type(expected_output) )
  local test_output = rule(test_input, ...)
  assert( "table" == type(test_output) )
  -- A length mismatch is a failure in its own right; checking it
  -- first also prevents indexing a missing element below.
  local matches = #test_output == #expected_output
  for i = 1, #expected_output do
    if not matches then
      break
    end
    matches =
      test_output[i].bidi_class == expected_output[i].bidi_class
  end
  if not matches then
    -- The dump is only built when a test actually fails. Level 0
    -- keeps the message free of position information.
    error(description .. dump_table(test_output), 0)
  end
end
-- Tests for rule W2. Each case lists: description, bidi classes of
-- the input, expected bidi classes afterwards, sos.
do
  -- build a run from a list of bidi classes
  local classes = function(...)
    local run = {}
    for _, bidi_class in ipairs({ ... }) do
      run[#run + 1] = { bidi_class = bidi_class }
    end
    return run
  end
  local cases = {
    {
      "Test W2: AL EN → AL AN",
      classes("AL", "EN"),
      classes("AL", "AN"),
      "L",
    },
    {
      "Test W2: AL NI EN → AL NI AN",
      classes("AL", "NI", "EN"),
      classes("AL", "NI", "AN"),
      "L",
    },
    {
      "Test W2: sos NI EN → sos NI EN (sos = L)",
      classes("NI", "EN"),
      classes("NI", "EN"),
      "L",
    },
    {
      "Test W2: sos NI EN → sos NI EN (sos = R)",
      classes("NI", "EN"),
      classes("NI", "EN"),
      "R",
    },
    {
      "Test W2: L NI EN → L NI EN",
      classes("L", "NI", "EN"),
      classes("L", "NI", "EN"),
      "L",
    },
    {
      "Test W2: R NI EN → R NI EN",
      classes("R", "NI", "EN"),
      classes("R", "NI", "EN"),
      "L",
    },
  }
  for _, case in ipairs(cases) do
    test_rule(case[1], W2, case[2], case[3], case[4])
  end
end
-- Rule W3: change all ALs to R.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- returns: the same run table
local W3 = function(run)
  for _, element in ipairs(run) do
    if "AL" == element.bidi_class then
      element.bidi_class = "R"
    end
  end
  return run
end
-- Test for rule W3: every AL in the run must become R.
test_rule(
  "Test W3: AL AL AL → R R R",
  W3,
  { { bidi_class = "AL" }, { bidi_class = "AL" }, { bidi_class = "AL" } },
  { { bidi_class = "R" }, { bidi_class = "R" }, { bidi_class = "R" } }
)
-- Rule W4: resolve single separators between numbers.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- returns: the same run table
local W4 = function(run)
  -- The first and the last element have only one neighbour and can
  -- therefore never be "between" two numbers.
  for i = 2, #run - 1 do
    local bidi_class = run[i].bidi_class
    if "ES" == bidi_class or "CS" == bidi_class then
      local before = run[i-1].bidi_class
      local after = run[i+1].bidi_class
      if before == after then
        if "EN" == before then
          -- A single European separator between two European
          -- numbers changes to a European number. The same goes
          -- for a single common separator.
          run[i].bidi_class = "EN"
        elseif "AN" == before and "CS" == bidi_class then
          -- A single common separator between two numbers of the
          -- same type changes to that type.
          run[i].bidi_class = "AN"
        end
      end
    end
  end
  return run
end
-- Tests for rule W4. Each case lists: description, bidi classes of
-- the input, expected bidi classes afterwards.
do
  -- build a run from a list of bidi classes
  local classes = function(...)
    local run = {}
    for _, bidi_class in ipairs({ ... }) do
      run[#run + 1] = { bidi_class = bidi_class }
    end
    return run
  end
  local cases = {
    {
      "Test·W4:·EN·ES·EN·→·EN·EN·EN",
      classes("EN", "ES", "EN"),
      classes("EN", "EN", "EN"),
    },
    {
      "Test·W4:·EN·CS·EN·→·EN·EN·EN",
      classes("EN", "CS", "EN"),
      classes("EN", "EN", "EN"),
    },
    {
      "Test·W4:·AN·CS·AN·→·AN·AN·AN",
      classes("AN", "CS", "AN"),
      classes("AN", "AN", "AN"),
    },
  }
  for _, case in ipairs(cases) do
    test_rule(case[1], W4, case[2], case[3])
  end
end
-- Rule W5: a sequence of European terminators adjacent to European
-- numbers changes to all European numbers.
--
-- The whole sequence of ETs is examined at once, so that e.g.
-- "ET ET EN" becomes "EN EN EN" and not only the terminator that
-- directly touches the number is changed.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- returns: the same run table
bidi.W5 = function(run)
  local count = #run
  local i = 1
  while i <= count do
    if "ET" == run[i].bidi_class then
      -- find the extent of this sequence of terminators
      local first = i
      while i <= count and "ET" == run[i].bidi_class do
        i = i + 1
      end
      local last = i - 1
      local number_before =
        1 < first and "EN" == run[first-1].bidi_class
      local number_after =
        count > last and "EN" == run[last+1].bidi_class
      if number_before or number_after then
        for k = first, last do
          run[k].bidi_class = "EN"
        end
      end
    else
      i = i + 1
    end
  end
  return run
end
-- Rule W6: otherwise, separators and terminators change to Other
-- Neutral. "Otherwise" refers to the separators and terminators
-- that are still present when this rule is applied.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- returns: the same run table
bidi.W6 = function(run)
  local is_separator_or_terminator = {
    ES = true,
    ET = true,
    CS = true,
  }
  for _, element in ipairs(run) do
    if is_separator_or_terminator[element.bidi_class] then
      element.bidi_class = "ON"
    end
  end
  return run
end
-- Rule W7: a European number (EN) whose nearest preceding strong
-- type is L changes to L.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- sos: type assigned to the virtual position before the run; it is
--      the strong type that is "found" if the run contains no L or R
--      before the number
-- returns: the same run table
bidi.W7 = function(run, sos)
  for i = 1, #run do
    if "EN" == run[i].bidi_class then
      -- Search backward from each instance of a European number
      -- until the first strong type (R, L, or sos) is found. If an
      -- L is found, then change the type of the European number to
      -- L.
      local previous_strong_bidi_class = sos
      for j = i - 1, 1, -1 do
        local previous_bidi_class = run[j].bidi_class
        if (
          "L" == previous_bidi_class or -- left-to-right
          "R" == previous_bidi_class -- right-to-left
        ) then
          previous_strong_bidi_class = previous_bidi_class
          break
        end
      end
      if "L" == previous_strong_bidi_class then
        run[i].bidi_class = "L"
      end
    end
  end
  return run
end
-- Resolve weak types by applying the rules W1 to W7 in order.
-- Every rule receives the result of the previous one; the rules
-- that search backward for a strong type also receive sos.
--
-- run: sequence of elements with a bidi_class field
-- sos: type assigned to the virtual position before the run
-- returns: the run as returned by rule W7
bidi.resolve_weak_types = function(run, sos)
  local resolved = bidi.W1(run, sos)
  resolved = W2(resolved, sos)
  resolved = W3(resolved)
  resolved = W4(resolved)
  resolved = bidi.W5(resolved)
  resolved = bidi.W6(resolved)
  return bidi.W7(resolved, sos)
end
-- Resolve neutral and isolate formatting types (NIs), rules N1 and
-- N2. Rule N0 (bracket pairs) is not implemented.
--
-- run: sequence of elements with a bidi_class field, changed in place
-- embedding_direction: "L" or "R"; "AL" is accepted and treated as
--                      "R" because weak type resolution (W3) has
--                      already turned every AL into R at this point
-- returns: the same run table
bidi.resolve_ni_types = function(run, embedding_direction)
  if "AL" == embedding_direction then
    embedding_direction = "R"
  end
  local is_ni = {
    B = true,
    S = true,
    WS = true,
    ON = true,
    FSI = true,
    LRI = true,
    RLI = true,
    PDI = true,
  }
  -- Direction that a type contributes to surrounding NIs.
  -- European and Arabic numbers act as if they were R in terms of
  -- their influence on NIs. All other types are skipped over.
  local influence = {
    L = "L",
    R = "R",
    EN = "R",
    AN = "R",
  }
  local count = #run
  for i = 1, count do
    -- N0
    -- FIXME: Process bracket pairs!
    if is_ni[run[i].bidi_class] then
      local previous_direction = nil
      for j = i - 1, 1, -1 do
        previous_direction = influence[run[j].bidi_class]
        if nil ~= previous_direction then
          break
        end
      end
      local next_direction = nil
      for j = i + 1, count do
        next_direction = influence[run[j].bidi_class]
        if nil ~= next_direction then
          break
        end
      end
      if (
        nil ~= previous_direction and
        previous_direction == next_direction
      ) then
        -- N1
        -- A sequence of NIs takes the direction of the surrounding
        -- strong text if the text on both sides has the same
        -- direction.
        run[i].bidi_class = previous_direction
      else
        -- N2
        -- Any remaining NIs take the embedding direction. This
        -- includes NIs that reach the start or the end of the run.
        run[i].bidi_class = embedding_direction
      end
    end
  end
  return run
end
-- Resolve implicit levels, rules I1 and I2: raise the embedding level
-- of each element depending on its resolved bidi class.
--
-- run: sequence of elements with bidi_class and embedding_level
--      fields, changed in place
-- embedding_level: the level whose parity selects rule I1 or I2 for
--                  every element of the run
-- returns: the same run table
bidi.resolve_implicit_types = function(run, embedding_level)
  for _, element in ipairs(run) do
    local bidi_class = element.bidi_class
    local is_number = "AN" == bidi_class or "EN" == bidi_class
    local increase = 0
    if 0 == embedding_level % 2 then
      -- I1
      -- For all characters with an even (left-to-right)
      -- embedding level, those of type R go up one level and those
      -- of type AN or EN go up two levels.
      if "R" == bidi_class then
        increase = 1
      elseif is_number then
        increase = 2
      end
    elseif "L" == bidi_class or is_number then
      -- I2
      -- For all characters with an odd (right-to-left) embedding
      -- level, those of type L, EN or AN go up one level.
      increase = 1
    end
    if 0 ~= increase then
      element.embedding_level = element.embedding_level + increase
    end
  end
  return run
end
-- Reverse any contiguous sequences of elements that are at
-- minimum_embedding_level or higher. A sequence that extends to the
-- end of the run is reversed as well.
--
-- run: sequence of elements with an embedding_level field, changed
--      in place
-- minimum_embedding_level: lowest level that belongs to a sequence
-- returns: the same run table
bidi.reverse_sequences = function(run, minimum_embedding_level)
  -- reverse run[first] … run[last] in place
  local reverse_range = function(first, last)
    while first < last do
      run[first], run[last] = run[last], run[first]
      first = first + 1
      last = last - 1
    end
  end
  local sequence_start = nil
  for i = 1, #run do
    if minimum_embedding_level <= run[i].embedding_level then
      if nil == sequence_start then
        -- found the start of a sequence
        sequence_start = i
      end
    elseif nil ~= sequence_start then
      -- found the end of a sequence: element i is below the minimum
      -- level, so the sequence ends at the element before it
      reverse_range(sequence_start, i - 1)
      sequence_start = nil
    end
  end
  if nil ~= sequence_start then
    -- the last sequence is terminated by the end of the run
    reverse_range(sequence_start, #run)
  end
  return run
end
-- Reorder a run with resolved embedding levels for display. Of the
-- rules L1 to L4 only L2 is implemented.
--
-- run: sequence of elements with an embedding_level field
-- paragraph_embedding_level: currently unused, see the FIXME for L1
-- returns: the reordered run
bidi.reorder_resolved_levels = function(run, paragraph_embedding_level)
  -- L1
  -- FIXME: Reset some embedding levels to paragraph embedding level!
  -- L2
  -- From the highest level found in the text to the lowest odd level
  -- on each line, including intermediate levels not actually present
  -- in the text, reverse any contiguous sequence of characters that
  -- are at that level or higher.
  local highest_level = 0
  for i = 1, #run do
    highest_level = math.max(highest_level, run[i].embedding_level)
  end
  -- Level 1 is the lowest odd level that can occur.
  local level = highest_level
  while 1 <= level do
    run = bidi.reverse_sequences(run, level)
    level = level - 1
  end
  -- L3
  -- FIXME: Fix combining marks applied to right-to-left characters.
  -- L4
  -- FIXME: Replace characters by mirrored glyphs.
  return run
end
--[[
dofile("utf8.lua")
local text = "Reuben Rivlin (ראובן ריבלין; * 1939 in Jerusalem)"
local text_reordered = utf8.codepoints_to_text(
bidi.get_visual_reordering(
utf8.text_to_codepoints(
text
)
)
)
print(text)
print(text_reordered)
--]]

View File

@ -1,3 +1,7 @@
Arabic:
نص حكيم له سر قاطع وذو شأن عظيم
مكتوب على ثوب أخضر ومغلف بجلد أزرق
Azeri:
Zəfər, jaketini də papağını da götür,
bu axşam hava çox soyuq olacaq.
@ -13,29 +17,27 @@ Czech:
Danish:
Høj bly gom vandt fræk sexquiz på wc.
Esperanto:
Eĥoŝanĝo ĉiuĵaŭde.
Ethiopic:
ሰማይ አይታረስ ንጉሥ አይከሰስ።
Finnish:
Törkylempijävongahdus.
Greek:
Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία
Georgian:
გთხოვთ ახლავე გაიაროთ რეგისტრაცია
German (non-ASCII letters):
Heizölrückstoßabdämpfung
Hebrew (does not work yet):
Greek:
Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία
Hebrew:
נקודה מודגשת
Icelandic:
Kæmi ný öxi hér, ykist þjófum nú bæði
víl og ádrepa.
Irish:
d'Ith cat mór dubh na héisc lofa go pras
Russian:
В чащах юга жил бы цитрус?
Да, но фальшивый экземпляр!
Hungarian:
Árvíztűrő tükörfúrógép
@ -45,21 +47,39 @@ Hiragana:
うゐのおくやまけふこえて
あさきゆめみしゑひもせす
Icelandic:
Kæmi ný öxi hér, ykist þjófum nú bæði
víl og ádrepa.
Irish:
d'Ith cat mór dubh na héisc lofa go pras
Katakana:
イロハニホヘト チリヌルヲ
ワカヨタレソ ツネナラム
ウヰノオクヤマ ケフコエテ
アサキユメミシ ヱヒモセスン
Georgian:
გთხოვთ ახლავე გაიაროთ რეგისტრაცია
Korean:
다람쥐 헌 쳇바퀴에 타고파
Ethiopic:
ሰማይ አይታረስ ንጉሥ አይከሰስ።
Malayalam:
ബ്qരഹ്മപുരത്തേക്ക്
Runic:
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ
Russian:
В чащах юга жил бы цитрус?
Да, но фальшивый экземпляр!
Thai:
พ่อขุนรามคำแหงมหาราช
Uyghur:
زۆھرەگۈل ئابدۇۋاجىت فرانسىيەنىڭ
پارىژدىكى خېلى بىشەم ئوقۇغۇچى.
Various latin alphabet variants:
𝐛𝐨𝐥𝐝 𝖋𝖗𝖆𝖐𝖙𝖚𝖗 𝒊𝒕𝒂𝒍𝒊𝒄 𝓼𝓬𝓻𝓲𝓹𝓽
𝕕𝕠𝕦𝕓𝕝𝕖-𝕤𝕥𝕣𝕦𝕔𝕜 𝚖𝚘𝚗𝚘𝚜𝚙𝚊𝚌𝚎
@ -71,6 +91,26 @@ Emoji:
🔀 🔁 🔂 🔃 🔄 🔅 🔆 🔇
😀 😁 😂 😃 😄 😅 😆 😇
Line breaking:
U+000A LINE FEED
LINE 1
LINE 2
U+000A LINE FEED
LINE 1 LINE 2
U+000D CARRIAGE RETURN
U+000A LINE FEED
LINE 1
LINE 2
U+0085 NEXT LINE
LINE 1… LINE 2
U+2029 PARAGRAPH SEPARATOR
LINE 1 LINE 2
Horizontal tabulator:
1 2 3 4 5 6
t a b
@ -107,8 +147,10 @@ Combining characters:
U+0EB3 LAO VOWEL SIGN AM
ກຳ
Thai:
พ่อขุนรามคำแหงมหาราช
Bidirectional text:
Malayalam:
ബ്qരഹ്മപുരത്തേക്ക്
Year rendered before hebrew name, no LRM:
Rivlin (ראובן ריבלין; * 1939 Jerusalem)
Year rendered after hebrew name with LRM:
Rivlin (ראובן ריבלין‎; * 1939 Jerusalem)

View File

@ -14,6 +14,7 @@ Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
dofile("bidi.lua")
dofile("pixelops.lua")
dofile("unicodedata.lua")
dofile("utf8.lua")
@ -225,8 +226,9 @@ hexfont.render_line = function(self, text)
for i = 1, 16 do
result[i] = {}
end
local codepoints = utf8.text_to_codepoints(text)
-- FIXME: only works for LTR, should use UAX #9
local codepoints = bidi.get_visual_reordering(
utf8.text_to_codepoints(text)
)
for i = 1, #codepoints do
local codepoint = codepoints[i]
local bitmap_hex = self[codepoint]
@ -294,7 +296,28 @@ hexfont.render_text = function(self, text)
local result
local max_width = 0
-- TODO: implement UAX #14
-- According to UAX #14, line breaks happen on:
-- • U+000A LINE FEED
-- • U+000D CARRIAGE RETURN (except as part of CRLF)
-- • U+0085 NEXT LINE
-- • U+2029 PARAGRAPH SEPARATOR
--
-- Hack: Replace all of those with LINE FEED.
-- FIXME: This makes CRLF into two newlines …
local codepoints = utf8.text_to_codepoints(text)
for i, codepoint in ipairs(codepoints) do
if (
0x000D == codepoints[i] or
0x0085 == codepoints[i] or
0x2029 == codepoints[i]
) then
codepoints[i] = 0x000A
end
end
-- FIXME: Code below should only operate on codepoints! Converting
-- back and forth makes it needlessly slow but I do not know how
-- to split a table properly to get a single table for each line …
text = utf8.codepoints_to_text(codepoints)
for utf8_line in string.gmatch(text .. "\n", "([^\n]*)\n") do
local pixels = self:render_line(utf8_line)
assert( nil ~= pixels )

View File

@ -16,6 +16,68 @@ Codezeilen und schreibe das auch nicht dran.
utf8 = {}
-- convert a table with codepoints into an UTF-8 string
-- inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>
--
-- codepoints: sequence of integers from 1 to 1114111 (U+10FFFF)
-- returns: the UTF-8 encoding of all codepoints, concatenated
-- raises: an error for any codepoint that is not an integer in that
--         range; note that this includes 0 (U+0000)
utf8.codepoints_to_text = function(codepoints)
  assert(
    "table" == type(codepoints)
  )
  local char = string.char
  local floor = math.floor
  local codepoints_encoded = {}
  for i, codepoint in ipairs(codepoints) do
    if (
      0 >= codepoint or
      1114111 < codepoint or
      floor(codepoint) ~= codepoint
    ) then
      error(
        string.format(
          "invalid codepoint: %s",
          codepoint
        )
      )
    end
    if codepoint <= 0x7F then
      -- one byte encoding
      codepoints_encoded[i] = char(codepoint)
    elseif codepoint <= 0x7FF then
      -- two bytes encoding; U+07FF is the largest codepoint that
      -- fits into the 11 payload bits, U+0800 needs three bytes
      codepoints_encoded[i] = char(
        floor(codepoint / 64) + 192,
        codepoint % 64 + 128
      )
    elseif codepoint <= 0xFFFF then
      -- three bytes encoding
      codepoints_encoded[i] = char(
        floor(codepoint / 4096) + 224,
        floor(codepoint / 64) % 64 + 128,
        codepoint % 64 + 128
      )
    else
      -- four bytes encoding
      codepoints_encoded[i] = char(
        floor(codepoint / 262144) + 240,
        floor(codepoint / 4096) % 64 + 128,
        floor(codepoint / 64) % 64 + 128,
        codepoint % 64 + 128
      )
    end
  end
  return table.concat(codepoints_encoded)
end
-- Smoke test with one codepoint for each encoded length (one, two,
-- three and four bytes): U+0077 U+00F0 U+2665 U+10348
local text = utf8.codepoints_to_text(
  { 0x0077, 0x00F0, 0x2665, 0x10348 }
)
assert( text == "wð♥𐍈" )
-- convert an UTF-8 string into a table with codepoints
-- inspired by <https://lua-users.org/wiki/LuaUnicode>
utf8.text_to_codepoints = function(text)