* Refactor newline handling to handle CR, NEL, PSEP

2023-03-21 17:24:44 +01:00 · 2023-03-21 17:24:44 +01:00 · 9b984e0d2f
parent 6bd77156c8
commit 9b984e0d2f
3 changed files with 104 additions and 1 deletions
--- a/example.txt
+++ b/example.txt
@@ -71,6 +71,26 @@ Emoji:
 	🔀	🔁	🔂	🔃	🔄	🔅	🔆	🔇	
 	😀	😁	😂	😃	😄	😅	😆	😇	

+Line breaking:
+
+  U+000A LINE FEED
+  LINE 1
+  LINE 2
+
+  U+000D CARRIAGE RETURN
+  LINE 1
  LINE 2
+
+  U+000D CARRIAGE RETURN
+  U+000A LINE FEED
+  LINE 1
+  LINE 2
+
+  U+0085 NEXT LINE
+  LINE 1  LINE 2
+
+  U+2029 PARAGRAPH SEPARATOR
+  LINE 1   LINE 2
+
 Horizontal tabulator:
 	1	2	3	4	5	6
 	t	a	b
--- a/hexfont.lua
+++ b/hexfont.lua
@@ -294,7 +294,28 @@ hexfont.render_text = function(self, text)

   local result
   local max_width = 0
-   -- TODO: implement UAX #14
+   -- According to UAX #14, line breaks happen on:
+   -- • U+000A LINE FEED
+   -- • U+000D CARRIAGE RETURN (except as part of CRLF)
+   -- • U+0085 NEXT LINE
+   -- • U+2029 PARAGRAPH SEPARATOR
+   --
+   -- Hack: Replace all of those with LINE FEED.
+   -- FIXME: This makes CRLF into two newlines …
+   local codepoints = utf8.text_to_codepoints(text)
+   for i, codepoint in ipairs(codepoints) do
+      if (
+         0x000D == codepoints[i] or
+         0x0085 == codepoints[i] or
+         0x2029 == codepoints[i]
+      ) then
+         codepoints[i] = 0x000A
+      end
+   end
+   -- FIXME: Code below should only operate on codepoints! Converting
+   -- back and forth makes it needlessly slow – but I do not know how
+   -- to split a table properly to get a single table for each line …
+   text = utf8.codepoints_to_text(codepoints)
   for utf8_line in string.gmatch(text .. "\n", "([^\n]*)\n") do
      local pixels = self:render_line(utf8_line)
      assert( nil ~= pixels )
--- a/utf8.lua
+++ b/utf8.lua
@@ -16,6 +16,68 @@ Codezeilen und schreibe das auch nicht dran.

 utf8 = {}

+-- Convert a table of Unicode codepoints into a UTF-8 encoded string.
+-- Inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>.
+--
+-- codepoints: sequence of whole numbers from 0 (U+0000) to 1114111
+--             (U+10FFFF); only the part visited by ipairs() is used.
+-- Returns the UTF-8 byte string ("" for an empty table).
+-- Raises an error if the argument is not a table or if an element is
+-- negative, above U+10FFFF or not a whole number.
+utf8.codepoints_to_text = function(codepoints)
+   assert("table" == type(codepoints))
+   local floor = math.floor
+   local encoded = {}
+   for _, codepoint in ipairs(codepoints) do
+      if (
+         0 > codepoint or  -- U+0000 itself is a valid codepoint
+         1114111 < codepoint or
+         floor(codepoint) ~= codepoint
+      ) then
+         error(string.format("invalid codepoint: %s", codepoint))
+      end
+      local bytes
+      if codepoint <= 0x7F then
+         -- one byte: 0xxxxxxx
+         bytes = string.char(codepoint)
+      elseif codepoint <= 0x7FF then
+         -- two bytes: 110xxxxx 10xxxxxx
+         -- (U+0800 must not land here: it would come out as the
+         -- invalid sequence E0 80, so it is encoded with three bytes)
+         bytes = string.char(
+            floor(codepoint / 64) + 192,
+            codepoint % 64 + 128
+         )
+      elseif codepoint <= 0xFFFF then
+         -- three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+         bytes = string.char(
+            floor(codepoint / 4096) + 224,
+            floor(codepoint / 64) % 64 + 128,
+            codepoint % 64 + 128
+         )
+      else
+         -- four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+         -- (the upper bound U+10FFFF was already checked above)
+         bytes = string.char(
+            floor(codepoint / 262144) + 240,
+            floor(codepoint / 4096) % 64 + 128,
+            floor(codepoint / 64) % 64 + 128,
+            codepoint % 64 + 128
+         )
+      end
+      encoded[#encoded + 1] = bytes
+   end
+   return table.concat(encoded)
+end
+
+-- Self-test executed when this file is loaded: encodes one sample
+-- codepoint for each UTF-8 length (1, 2, 3 and 4 bytes) and checks
+-- the result against the expected string.
+local text = utf8.codepoints_to_text({
+   119, 240, 9829, 66376,  -- U+0077 U+00F0 U+2665 U+10348
+})
+assert(text == "wð♥𐍈")
+
 -- convert an UTF-8 string into a table with codepoints
 -- inspired by <https://lua-users.org/wiki/LuaUnicode>
 utf8.text_to_codepoints = function(text)