+ Modify test for bidi rule W2 so it passes

* Refactor tests for bidi rules W2, W3, W4
+ Add tests for bidi rule W4
2023-09-02 15:20:42 +02:00 · 2023-03-26 21:29:32 +02:00 · 2023-03-26 20:46:38 +02:00 · 2023-03-26 20:42:55 +02:00 · 2023-03-26 20:41:52 +02:00 · 2023-03-22 18:22:17 +01:00
5 changed files with 804 additions and 28 deletions
--- a/README.rst
+++ b/README.rst
@ -223,10 +223,15 @@ Hint 1: <https://unifoundry.com/unifont/index.html>

 Hint 2: <https://trevorldavis.com/R/hexfont/>

-Why is Arabic / Hebrew / Urdu etc. text rendered wrong, i.e. left to right?
-+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+Why is Arabic / Hebrew / Urdu etc. text rendered somewhat wrong?
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-I did not implement the Unicode Bidirectional Algorithm. Patches welcome.
+I did not implement the entire `Unicode Bidirectional Algorithm`_.
+
+If you are able to read a right-to-left language, please help …
+
+.. _`Unicode Bidirectional Algorithm`:
+   https://www.unicode.org/reports/tr9/

 Why is the generated pixels table upside down?
 ++++++++++++++++++++++++++++++++++++++++++++++
--- a/bidi.lua
+++ b/bidi.lua
@ -0,0 +1,644 @@
+#!/usr/bin/env lua5.1
+
+--[[
+Copyright © 2023  Nils Dagsson Moskopp (erle)
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
+steigern. Gelegentlich packe ich sogar einen handfesten Buffer
+Overflow oder eine Format String Vulnerability zwischen die anderen
+Codezeilen und schreibe das auch nicht dran.
+]]--
+
+dofile("unicodedata.lua")
+
+bidi = {}
+
+local get_paragraph_direction = function(codepoints)
+   -- Find the first character of type L, AL or R …
+   -- See <https://www.unicode.org/reports/tr9/#P2>
+   for _, codepoint in ipairs(codepoints) do
+      if unicodedata[codepoint] then
+         local bidi_class = unicodedata[codepoint].bidi_class
+         if (
+            "L" == bidi_class or  -- left-to-right
+            "R" == bidi_class or  -- right-to-left
+            "AL" == bidi_class  -- right-to-left (arabic)
+         ) then
+            return bidi_class
+         end
+      end
+   end
+end
+
+-- See <https://www.unicode.org/reports/tr9/>
+bidi.get_visual_reordering = function(codepoints)
+   -- rule P2
+   local paragraph_direction =
+      get_paragraph_direction(codepoints) or "L"
+
+   -- rule P3
+   local paragraph_embedding_level = 0
+   if (
+      "R" == paragraph_direction or
+      "AL" == paragraph_direction
+   ) then
+      paragraph_embedding_level = 1
+   end
+
+   -- FIXME: Rule X1 to X10 are not implemented yet. This basically
+   -- means that explicit levels or display directions are ignored.
+
+   local run = {}
+   for i, codepoint in ipairs(codepoints) do
+      run[i] = {}
+      run[i].codepoint = codepoint
+      run[i].bidi_class =
+         unicodedata[codepoint] and
+         unicodedata[codepoint].bidi_class or paragraph_direction
+      run[i].embedding_level = 0
+   end
+
+   -- Hack: This code is almost certainly non-conforming …
+   -- but it seems to “kinda” work. Someone should fix it!
+   run = bidi.resolve_weak_types(run, paragraph_direction)
+   run = bidi.resolve_ni_types(run, paragraph_direction)
+   run = bidi.resolve_implicit_types(run, paragraph_embedding_level)
+   run = bidi.reorder_resolved_levels(run, paragraph_embedding_level)
+
+   codepoints_reordered = {}
+   for i, element in ipairs(run) do
+      codepoints_reordered[i] = element.codepoint
+   end
+   return codepoints_reordered
+end
+
+bidi.W1 = function(run, sos)
+   -- Examine each nonspacing mark (NSM) in the isolating run
+   -- sequence, and change the type of the NSM to Other Neutral
+   -- if the previous character is an isolate initiator or PDI,
+   -- and to the type of the previous character otherwise. If
+   -- the NSM is at the start of the isolating run sequence, it
+   -- will get the type of sos.
+   for i = 1, #run do
+      if "NSM" == run[i].bidi_class then
+         if 1 == i then
+            run[i].bidi_class = sos
+         else
+            -- FIXME: handle isolate initiator, PDI
+            run[i].bidi_class = run[i-1].bidi_class
+         end
+      end
+   end
+   return run
+end
+
+local W2 = function(run, sos)
+   -- sos is the text ordering type assigned to the virtual position
+   -- before an isolating run sequence
+   assert(
+      "AL" == sos or  -- FIXME: find the actual bug & remove this line
+      "L" == sos or
+      "R" == sos
+   )
+   for i = 1, #run do
+      if "EN" == run[i].bidi_class then
+         -- Search backward from each instance of a European number
+         -- until the first strong type (R, L, AL, or sos) is found.
+         local previous_strong_bidi_class = nil
+         local j = i
+         repeat
+            j = j - 1
+            local previous_bidi_class = run[j].bidi_class
+            if (
+               "L" == previous_bidi_class or  -- left-to-right
+               "R" == previous_bidi_class or  -- right-to-left
+               "AL" == previous_bidi_class or -- right-to-left (arabic)
+               sos == previous_bidi_class
+            ) then
+               previous_strong_bidi_class = previous_bidi_class
+            end
+         until(
+            nil ~= previous_strong_bidi_class or
+            1 == j
+         )
+         -- If an AL is found, change the type of the European number
+         -- to Arabic number.
+         if "AL" == previous_strong_bidi_class then
+            run[i].bidi_class = "AN"
+         end
+      end
+   end
+   return run
+end
+
+dump_table = function(table_, indentation)
+   assert( "table" == type(table_) )
+
+   local indentation = indentation or 0
+   assert( "number" == type(indentation) )
+
+   local result = {}
+   for key, value in pairs(table_) do
+      local prefix = "\n" .. ("   "):rep(indentation) .. key .. ": "
+      if "table" == type(value) then
+         result[#result + 1] = prefix .. dump_table(value, indentation + 1)
+      else
+         result[#result + 1] = prefix .. tostring(value)
+      end
+   end
+   return table.concat(result)
+end
+
+local test_rule = function(
+      description,
+      rule,
+      test_input,
+      expected_output,
+      ...
+)
+   assert( "string" == type(description) )
+   assert( "function" == type(rule) )
+   assert( "table" == type(test_input) )
+   assert( "table" == type(expected_output) )
+   local test_output = rule(test_input, ...)
+   for i = 1, #test_input do
+      assert(
+         test_output[i].bidi_class == expected_output[i].bidi_class,
+         description ..
+         dump_table(test_output)
+      )
+   end
+end
+
+test_rule(
+   "Test W2: AL EN → AL AN",
+   W2,
+   {
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "AN" },
+   },
+   "L"
+)
+
+test_rule(
+   "Test W2: AL NI EN → AL NI AN",
+   W2,
+   {
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "AN" },
+   },
+   "L"
+)
+
+test_rule(
+   "Test W2: sos NI EN → sos NI EN (sos = L)",
+   W2,
+   {
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   "L"
+)
+
+test_rule(
+   "Test W2: sos NI EN → sos NI EN (sos = R)",
+   W2,
+   {
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   "R"
+)
+
+test_rule(
+   "Test W2: L NI EN → L NI EN",
+   W2,
+   {
+      { ["bidi_class"] = "L" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "L" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   "L"
+)
+
+test_rule(
+   "Test W2: R NI EN → R NI EN",
+   W2,
+   {
+      { ["bidi_class"] = "R" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "R" },
+      { ["bidi_class"] = "NI" },
+      { ["bidi_class"] = "EN" },
+   },
+   "L"
+)
+
+local W3 = function(run)
+   -- Change al ALs to R.
+   for i = 1, #run do
+      if "AL" == run[i].bidi_class then
+         run[i].bidi_class = "R"
+      end
+   end
+   return run
+end
+
+test_rule(
+   "Test W3: AL AL AL → R R R",
+   W3,
+   {
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "AL" },
+      { ["bidi_class"] = "AL" },
+   },
+   {
+      { ["bidi_class"] = "R" },
+      { ["bidi_class"] = "R" },
+      { ["bidi_class"] = "R" },
+   }
+)
+
+local W4 = function(run)
+   for i = 1, #run do
+      if "ES" == run[i].bidi_class then
+         -- A single European separator between two European numbers
+         -- changes to a European number.
+         if (
+            1 < i and
+            #run > i and
+            "EN" == run[i-1].bidi_class and
+            "EN" == run[i+1].bidi_class
+         ) then
+            run[i].bidi_class = "EN"
+         end
+      end
+      if "CS" == run[i].bidi_class then
+         -- A single common separator between two numbers of the same
+         -- type changes to that type.
+         if (
+            1 < i and
+            #run > i and
+            "EN" == run[i-1].bidi_class and
+            "EN" == run[i+1].bidi_class
+         ) then
+            run[i].bidi_class = "EN"
+         end
+         if (
+            1 < i and
+            #run > i and
+            "AN" == run[i-1].bidi_class and
+            "AN" == run[i+1].bidi_class
+         ) then
+            run[i].bidi_class = "AN"
+         end
+      end
+   end
+   return run
+end
+
+test_rule(
+   "Test·W4:·EN·ES·EN·→·EN·EN·EN",
+   W4,
+   {
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "ES" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "EN" },
+   }
+)
+
+test_rule(
+   "Test·W4:·EN·CS·EN·→·EN·EN·EN",
+   W4,
+   {
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "CS" },
+      { ["bidi_class"] = "EN" },
+   },
+   {
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "EN" },
+      { ["bidi_class"] = "EN" },
+   }
+)
+
+test_rule(
+   "Test·W4:·AN·CS·AN·→·AN·AN·AN",
+   W4,
+   {
+      { ["bidi_class"] = "AN" },
+      { ["bidi_class"] = "CS" },
+      { ["bidi_class"] = "AN" },
+   },
+   {
+      { ["bidi_class"] = "AN" },
+      { ["bidi_class"] = "AN" },
+      { ["bidi_class"] = "AN" },
+   }
+)
+
+bidi.W5 = function(run)
+   for i = 1, #run do
+      if "ET" == run[i].bidi_class then
+         -- A sequence of European terminators adjacent to European
+         -- numbers changes to all European numbers.
+         if (
+            1 < i and
+            "EN" == run[i-1].bidi_class
+         ) or (
+            #run > i and
+            "EN" == run[i+1].bidi_class
+         ) then
+            run[i].bidi_class = "EN"
+         end
+      end
+   end
+   return run
+end
+
+bidi.W6 = function(run)
+   for i = 1, #run do
+      if (
+         "ES" == run[i].bidi_class or
+         "ET" == run[i].bidi_class or
+         "CS" == run[i].bidi_class
+      ) then
+         -- Otherwise, separators and terminators change to Other
+         -- Neutral.
+         run[i].bidi_class = "ON"
+      end
+   end
+   return run
+end
+
+bidi.W7 = function(run, sos)
+   for i = 1, #run do
+      if "EN" == run[i].bidi_class then
+         -- Search backward from each instance of a European number
+         -- until the first strong type (R, L, or sos) is found. If an
+         -- L is found, then change the type of the European number to
+         -- L.
+         local previous_strong_bidi_class = nil
+         local j = i
+         repeat
+            local previous_bidi_class = run[j].bidi_class
+            if (
+               "L" == previous_bidi_class or  -- left-to-right
+               "R" == previous_bidi_class or  -- right-to-left
+               sos == previous_bidi_class
+            ) then
+               previous_strong_bidi_class = previous_bidi_class
+            end
+            j = j - 1
+         until(
+            nil ~= previous_strong_bidi_class or
+            0 == j
+         )
+         if "L" == previous_strong_bidi_class then
+            run[i].bidi_class = "L"
+         end
+      end
+   end
+   return run
+end
+
+bidi.resolve_weak_types = function(run, sos)
+   run = bidi.W1(run, sos)
+   run = W2(run, sos)
+   run = W3(run)
+   run = W4(run)
+   run = bidi.W5(run)
+   run = bidi.W6(run)
+   run = bidi.W7(run, sos)
+   return run
+end
+
+bidi.resolve_ni_types = function(run, embedding_direction)
+   for i, element in ipairs(run) do
+      -- N0
+      -- FIXME: Process bracket pairs!
+      -- N1
+      if (
+         "B" == run[i].bidi_class or
+         "S" == run[i].bidi_class or
+         "WS" == run[i].bidi_class or
+         "ON" == run[i].bidi_class or
+         "FSI" == run[i].bidi_class or
+         "LRI" == run[i].bidi_class or
+         "RLI" == run[i].bidi_class or
+         "PDI" == run[i].bidi_class
+      ) then
+         -- A sequence of NIs takes the direction of the surrounding
+         -- strong text if the text on both sides has the same
+         -- direction.
+         local previous_direction = nil
+         local j = i
+         repeat
+            local previous_bidi_class = run[j].bidi_class
+            if (
+               "L" == previous_bidi_class or  -- left-to-right
+               "R" == previous_bidi_class     -- right-to-left
+            ) then
+               previous_direction = previous_bidi_class
+            end
+            j = j - 1
+         until(
+            nil ~= previous_direction or
+            0 == j
+         )
+         local next_direction = nil
+         local j = i
+         repeat
+            local next_bidi_class = run[j].bidi_class
+            if (
+               "L" == next_bidi_class or  -- left-to-right
+               "R" == next_bidi_class     -- right-to-left
+            ) then
+               next_direction = next_bidi_class
+            end
+            j = j + 1
+         until(
+            nil ~= next_direction or
+            #run + 1 == j
+         )
+         if (
+            1 < i and
+            #run > i and
+            "L" == previous_direction and
+            "L" == next_direction
+         ) then
+            run[i].bidi_class = "L"
+         elseif (
+            -- European and Arabic numbers act as if they were R in
+            -- terms of their influence on NIs.
+            1 < i and
+            #run > i and
+            (
+               "R" == previous_direction or
+               "EN" == previous_direction or
+               "AN" == previous_direction
+            ) and (
+               "R" == next_direction or
+               "EN" == next_direction or
+               "AN" == next_direction
+            )
+         ) then
+            run[i].bidi_class = "R"
+         -- N2
+         else
+            -- Any remaining NIs take the embedding direction.
+            run[i].bidi_class = embedding_direction
+         end
+      end
+   end
+   return run
+end
+
+bidi.resolve_implicit_types = function(run, embedding_level)
+   for i, element in ipairs(run) do
+      -- I1
+      if 0 == embedding_level % 2 then
+         -- For all characters with an even (left-to-right)
+         -- embedding level, those of type R go up one level and those
+         -- of type AN or EN go up two levels.
+         if "R" == run[i].bidi_class then
+            run[i].embedding_level = run[i].embedding_level + 1
+         elseif (
+            "AN" == run[i].bidi_class or
+            "EN" == run[i].bidi_class
+         ) then
+            run[i].embedding_level = run[i].embedding_level + 2
+         end
+      -- I2
+      else
+         -- For all characters with an odd (right-to-left) embedding
+         -- level, those of type L, EN or AN go up one level.
+         if (
+            "L" == run[i].bidi_class or
+            "EN" == run[i].bidi_class or
+            "AN" == run[i].bidi_class
+         ) then
+            run[i].embedding_level = run[i].embedding_level + 1
+         end
+      end
+   end
+   return run
+end
+
+-- reverse any sequences at minimum_embedding_level or higher
+bidi.reverse_sequences = function(run, minimum_embedding_level)
+   local sequence_start
+   local sequence_end
+   for i = 1, #run do
+      if (
+         minimum_embedding_level <= run[i].embedding_level and
+         nil == sequence_start
+      ) then
+         -- found the start of a sequence
+         sequence_start = i
+      elseif (
+         minimum_embedding_level > run[i].embedding_level and
+         nil ~= sequence_start
+      ) then
+         -- found the end of a sequence
+         sequence_end = i
+      end
+      if (
+         nil ~= sequence_start and
+         nil ~= sequence_end
+      ) then
+         -- extract sequence
+         local sequence = {}
+         for j = 1, sequence_end - sequence_start do
+            sequence[#sequence+1] = run[sequence_start + j - 1]
+         end
+         -- insert sequence reversed
+         for k = 1, #sequence do
+            run[sequence_start + k - 1] = sequence[#sequence - k + 1]
+         end
+         sequence_start = nil
+         sequence_end = nil
+         sequence = {}
+      end
+   end
+   return run
+end
+
+bidi.reorder_resolved_levels = function(run, paragraph_embedding_level)
+   -- L1
+   -- FIXME: Reset some embedding levels to paragraph embedding level!
+   -- L2
+   -- From the highest level found in the text to the lowest odd level
+   -- on each line, including intermediate levels not actually present
+   -- in the text, reverse any contiguous sequence of characters that
+   -- are at that level or higher.
+   local max_embedding_level = 0
+   for _, element in ipairs(run) do
+      if max_embedding_level < element.embedding_level then
+         max_embedding_level = element.embedding_level
+      end
+   end
+   assert(
+      "number" == type(max_embedding_level)
+   )
+   for minimum_embedding_level = max_embedding_level, 1, -1 do
+      run = bidi.reverse_sequences(run, minimum_embedding_level)
+   end
+   -- L3
+   -- FIXME: Fix combining marks applied to right-to-left characters.
+   -- L4
+   -- FIXME: Replace characters by mirrored glyphs.
+   return run
+end
+
+--[[
+dofile("utf8.lua")
+
+local text = "Reuben Rivlin (ראובן ריבלין; * 1939 in Jerusalem)"
+local text_reordered = utf8.codepoints_to_text(
+   bidi.get_visual_reordering(
+      utf8.text_to_codepoints(
+         text
+      )
+   )
+)
+
+print(text)
+print(text_reordered)
+--]]
--- a/example.txt
+++ b/example.txt
@ -1,3 +1,7 @@
+Arabic:
+  نص حكيم له سر قاطع وذو شأن عظيم 
+  مكتوب على ثوب أخضر ومغلف بجلد أزرق
+
 Azeri:
  Zəfər, jaketini də papağını da götür,
  bu axşam hava çox soyuq olacaq.
@ -13,29 +17,27 @@ Czech:
 Danish:
  Høj bly gom vandt fræk sexquiz på wc.

+Esperanto:
+  Eĥoŝanĝo ĉiuĵaŭde.
+
+Ethiopic:
+  ሰማይ አይታረስ ንጉሥ አይከሰስ።
+
 Finnish:
  Törkylempijävongahdus.

-Greek:
-  Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία
+Georgian:
+  გთხოვთ ახლავე გაიაროთ რეგისტრაცია

 German (non-ASCII letters):
  Heizölrückstoßabdämpfung

-Hebrew (does not work yet):
+Greek:
+  Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία
+
+Hebrew:
  נקודה מודגשת

-Icelandic:
-  Kæmi ný öxi hér, ykist þjófum nú bæði 
-  víl og ádrepa.
-
-Irish:
-  d'Ith cat mór dubh na héisc lofa go pras
-
-Russian:
-  В чащах юга жил бы цитрус?
-  Да, но фальшивый экземпляр!
-
 Hungarian:
  Árvíztűrő tükörfúrógép

@ -45,21 +47,39 @@ Hiragana:
  うゐのおくやまけふこえて
  あさきゆめみしゑひもせす

+Icelandic:
+  Kæmi ný öxi hér, ykist þjófum nú bæði 
+  víl og ádrepa.
+
+Irish:
+  d'Ith cat mór dubh na héisc lofa go pras
+
 Katakana:
  イロハニホヘト チリヌルヲ 
  ワカヨタレソ ツネナラム 
  ウヰノオクヤマ ケフコエテ 
  アサキユメミシ ヱヒモセスン

-Georgian:
-  გთხოვთ ახლავე გაიაროთ რეგისტრაცია
+Korean:
+  다람쥐 헌 쳇바퀴에 타고파

-Ethiopic:
-  ሰማይ አይታረስ ንጉሥ አይከሰስ።
+Mayalayam:
+  ബ്qരഹ്മപുരത്തേക്ക്

 Runic:
  ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ

+Russian:
+  В чащах юга жил бы цитрус?
+  Да, но фальшивый экземпляр!
+
+Thai:
+  พ่อขุนรามคำแหงมหาราช
+
+Uyghur:
+  زۆھرەگۈل ئابدۇۋاجىت فرانسىيەنىڭ 
+  پارىژدىكى خېلى بىشەم ئوقۇغۇچى.
+
 Various latin alphabet variants:
  𝐛𝐨𝐥𝐝 𝖋𝖗𝖆𝖐𝖙𝖚𝖗 𝒊𝒕𝒂𝒍𝒊𝒄 𝓼𝓬𝓻𝓲𝓹𝓽
  𝕕𝕠𝕦𝕓𝕝𝕖-𝕤𝕥𝕣𝕦𝕔𝕜 𝚖𝚘𝚗𝚘𝚜𝚙𝚊𝚌𝚎
@ -71,6 +91,26 @@ Emoji:
 	🔀	🔁	🔂	🔃	🔄	🔅	🔆	🔇	
 	😀	😁	😂	😃	😄	😅	😆	😇	

+Line breaking:
+
+  U+000A LINE FEED
+  LINE 1
+  LINE 2
+
+  U+000A LINE FEED
+  LINE 1
  LINE 2
+
+  U+000D CARRIAGE RETURN
+  U+000A LINE FEED
+  LINE 1
+  LINE 2
+
+  U+0085 NEXT LINE
+  LINE 1  LINE 2
+
+  U+2029 PARAGRAPH SEPARATOR
+  LINE 1   LINE 2
+
 Horizontal tabulator:
 	1	2	3	4	5	6
 	t	a	b
@ -107,8 +147,10 @@ Combining characters:
  U+0EB3 LAO VOWEL SIGN AM
    ກຳ

-Thai:
-  พ่อขุนรามคำแหงมหาราช
+Bidirectional text:

-Mayalayam:
-  ബ്qരഹ്മപുരത്തേക്ക് 
+  Year rendered before hebrew name, no LRM:
+  Rivlin (ראובן ריבלין; * 1939 Jerusalem)
+
+  Year rendered after hebrew name with LRM:
+  Rivlin (ראובן ריבלין‎; * 1939 Jerusalem)
--- a/hexfont.lua
+++ b/hexfont.lua
@ -14,6 +14,7 @@ Overflow oder eine Format String Vulnerability zwischen die anderen
 Codezeilen und schreibe das auch nicht dran.
 ]]--

+dofile("bidi.lua")
 dofile("pixelops.lua")
 dofile("unicodedata.lua")
 dofile("utf8.lua")
@ -225,8 +226,9 @@ hexfont.render_line = function(self, text)
   for i = 1, 16 do
      result[i] = {}
   end
-   local codepoints = utf8.text_to_codepoints(text)
-   -- FIXME: only works for LTR, should use UAX #9
+   local codepoints = bidi.get_visual_reordering(
+      utf8.text_to_codepoints(text)
+   )
   for i = 1, #codepoints do
      local codepoint = codepoints[i]
      local bitmap_hex = self[codepoint]
@ -294,7 +296,28 @@ hexfont.render_text = function(self, text)

   local result
   local max_width = 0
-   -- TODO: implement UAX #14
+   -- According to UAX #14, line breaks happen on:
+   -- • U+000A LINE FEED
+   -- • U+000D CARRIAGE RETURN (except as part of CRLF)
+   -- • U+0085 NEXT LINE
+   -- • U+2029 PARAGRAPH SEPARATOR
+   --
+   -- Hack: Replace all of those with LINE FEED.
+   -- FIXME: This makes CRLF into two newlines …
+   local codepoints = utf8.text_to_codepoints(text)
+   for i, codepoint in ipairs(codepoints) do
+      if (
+         0x000D == codepoints[i] or
+         0x0085 == codepoints[i] or
+         0x2029 == codepoints[i]
+      ) then
+         codepoints[i] = 0x000A
+      end
+   end
+   -- FIXME: Code below should only operate on codepoints! Converting
+   -- back and forth makes it needlessly slow – but I do not know how
+   -- to split a table properly to get a single table for each line …
+   text = utf8.codepoints_to_text(codepoints)
   for utf8_line in string.gmatch(text .. "\n", "([^\n]*)\n") do
      local pixels = self:render_line(utf8_line)
      assert( nil ~= pixels )
--- a/utf8.lua
+++ b/utf8.lua
@ -16,6 +16,68 @@ Codezeilen und schreibe das auch nicht dran.

 utf8 = {}

+-- convert a table with codepoints into an UTF-8 string
+-- inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>
+utf8.codepoints_to_text = function(codepoints)
+   assert(
+      "table" == type(codepoints)
+   )
+   for _, codepoint in ipairs(codepoints) do
+      if (
+         0 >= codepoint or
+         1114111 < codepoint or
+         math.floor(codepoint) ~= codepoint
+      ) then
+         error(
+            string.format(
+               "invalid codepoint: %s",
+               codepoint
+            )
+         )
+      end
+   end
+   local codepoints_encoded = {}
+   codepoints_encoded.append = function(...)
+      codepoints_encoded[#codepoints_encoded+1] = string.char(...)
+   end
+   for _, codepoint in ipairs(codepoints) do
+      if codepoint <= 127 then
+         -- one byte encoding
+         codepoints_encoded.append(codepoint)
+      elseif codepoint <= 2048 then
+         -- two bytes encoding
+         codepoints_encoded.append(
+            math.floor(codepoint / 64) + 192,
+            codepoint % 64 + 128
+         )
+      elseif codepoint <= 65535 then
+         -- three bytes encoding
+         codepoints_encoded.append(
+            math.floor(codepoint / 4096) + 224,
+            math.floor(codepoint / 64) % 64 + 128,
+            codepoint % 64 + 128
+         )
+      elseif codepoint <= 1114111 then
+         -- four bytes encoding
+         codepoints_encoded.append(
+            math.floor(codepoint / 262144) + 240,
+            math.floor(codepoint / 4096) % 64 + 128,
+            math.floor(codepoint / 64) % 64 + 128,
+            codepoint % 64 + 128
+         )
+      end
+   end
+   return table.concat(codepoints_encoded)
+end
+
+-- Test one codepoint for each byte length:
+local text = utf8.codepoints_to_text(
+   {119, 240, 9829, 66376}  -- U+0077 U+00F0 U+2665 U+10348
+)
+assert(
+   "wð♥𐍈" == text
+)
+
 -- convert an UTF-8 string into a table with codepoints
 -- inspired by <https://lua-users.org/wiki/LuaUnicode>
 utf8.text_to_codepoints = function(text)
Author	SHA1	Message	Date
Nils Dagsson Moskopp	78a13f7490	+ Modify test for bidi rule W2 so it passes	2023-09-02 15:20:42 +02:00
Nils Dagsson Moskopp	d7ebaec797	* Refactor tests for bidi rules W2, W3, W4	2023-03-26 21:29:32 +02:00
Nils Dagsson Moskopp	732b063dcd	+ Add tests for bidi rule W4	2023-03-26 20:46:38 +02:00
Nils Dagsson Moskopp	fb971deccf	+ Add tests for bidi rule W3	2023-03-26 20:42:55 +02:00
Nils Dagsson Moskopp	58bc44bc7c	+ Add tests for bidi rule W2	2023-03-26 20:41:52 +02:00
Nils Dagsson Moskopp	4d8bea08fd	* Refactor bidi.lua (resolving weak types)	2023-03-22 18:22:17 +01:00
Nils Dagsson Moskopp	82d04cee6f	+ Add more example texts	2023-03-22 03:44:11 +01:00
Nils Dagsson Moskopp	792276cbb5	* Reorder example text	2023-03-22 03:15:33 +01:00
Nils Dagsson Moskopp	759a7d008f	+ Add rudimentary bidirectional rendering support	2023-03-22 02:59:44 +01:00
Nils Dagsson Moskopp	9b984e0d2f	* Refactor newline handling to handle CR, NEL, PSEP	2023-03-21 17:24:44 +01:00