* Refactor newline handling to handle CR, NEL, PSEP

This commit is contained in:
Nils Dagsson Moskopp 2023-03-21 17:24:44 +01:00
parent 6bd77156c8
commit 9b984e0d2f
3 changed files with 104 additions and 1 deletions

View File

@ -71,6 +71,26 @@ Emoji:
🔀 🔁 🔂 🔃 🔄 🔅 🔆 🔇
😀 😁 😂 😃 😄 😅 😆 😇
Line breaking:
U+000A LINE FEED
LINE 1
LINE 2
U+000A LINE FEED
LINE 1 LINE 2
U+000D CARRIAGE RETURN
U+000A LINE FEED
LINE 1
LINE 2
U+0085 NEXT LINE
LINE 1… LINE 2
U+2029 PARAGRAPH SEPARATOR
LINE 1 LINE 2
Horizontal tabulator:
1 2 3 4 5 6
t a b

View File

@ -294,7 +294,28 @@ hexfont.render_text = function(self, text)
local result
local max_width = 0
-- TODO: implement UAX #14
-- According to UAX #14, line breaks happen on:
-- • U+000A LINE FEED
-- • U+000D CARRIAGE RETURN (except as part of CRLF)
-- • U+0085 NEXT LINE
-- • U+2029 PARAGRAPH SEPARATOR
--
-- Hack: Replace all of those with LINE FEED.
-- FIXME: This makes CRLF into two newlines …
local codepoints = utf8.text_to_codepoints(text)
for i, codepoint in ipairs(codepoints) do
if (
0x000D == codepoints[i] or
0x0085 == codepoints[i] or
0x2029 == codepoints[i]
) then
codepoints[i] = 0x000A
end
end
-- FIXME: Code below should only operate on codepoints! Converting
-- back and forth makes it needlessly slow but I do not know how
-- to split a table properly to get a single table for each line …
text = utf8.codepoints_to_text(codepoints)
for utf8_line in string.gmatch(text .. "\n", "([^\n]*)\n") do
local pixels = self:render_line(utf8_line)
assert( nil ~= pixels )

View File

@ -16,6 +16,68 @@ Codezeilen und schreibe das auch nicht dran.
utf8 = {}

--- Convert a table with codepoints into an UTF-8 string.
-- Inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>
--
-- Encoded length per codepoint range:
--   U+0001  … U+007F   → one byte
--   U+0080  … U+07FF   → two bytes
--   U+0800  … U+FFFF   → three bytes
--   U+10000 … U+10FFFF → four bytes
--
-- @tparam table codepoints sequence of integer codepoints (1 … 0x10FFFF)
-- @treturn string UTF-8 encoding of all codepoints, concatenated in order
--
-- Raises an error if the argument is not a table or if any element is
-- not an integral number in the accepted range. U+0000 is rejected.
utf8.codepoints_to_text = function(codepoints)
	assert(
		"table" == type(codepoints)
	)
	-- Validate everything up front so that no partial output is built
	-- for invalid input.
	for _, codepoint in ipairs(codepoints) do
		if (
			"number" ~= type(codepoint) or
			0 >= codepoint or
			0x10FFFF < codepoint or
			math.floor(codepoint) ~= codepoint
		) then
			error(
				string.format(
					"invalid codepoint: %s",
					tostring(codepoint)
				)
			)
		end
	end
	-- Every codepoint yields exactly one encoded piece, so the pieces
	-- can share the index of the codepoint they were made from.
	local encoded = {}
	for i, codepoint in ipairs(codepoints) do
		if codepoint <= 0x7F then
			-- one byte encoding
			encoded[i] = string.char(codepoint)
		elseif codepoint <= 0x7FF then
			-- two bytes encoding; U+0800 already needs three bytes,
			-- so the upper bound here is 0x7FF (2047), not 2048
			encoded[i] = string.char(
				math.floor(codepoint / 64) + 192,
				codepoint % 64 + 128
			)
		elseif codepoint <= 0xFFFF then
			-- three bytes encoding
			encoded[i] = string.char(
				math.floor(codepoint / 4096) + 224,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		else
			-- four bytes encoding (upper bound was validated above)
			encoded[i] = string.char(
				math.floor(codepoint / 262144) + 240,
				math.floor(codepoint / 4096) % 64 + 128,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		end
	end
	return table.concat(encoded)
end
-- Self-test for utf8.codepoints_to_text, executed when this chunk runs.
-- Test one codepoint for each byte length:
-- the four inputs encode to 1, 2, 3 and 4 bytes respectively.
local text = utf8.codepoints_to_text(
{119, 240, 9829, 66376} -- U+0077 U+00F0 U+2665 U+10348
)
-- The encoded result must equal the literal; otherwise the assert
-- raises an error.
assert(
"wð♥𐍈" == text
)
-- convert an UTF-8 string into a table with codepoints
-- inspired by <https://lua-users.org/wiki/LuaUnicode>
utf8.text_to_codepoints = function(text)