unicode_text/bidi.lua

644 lines
17 KiB
Lua

#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
-- Run unicodedata.lua. The functions below read a global table named
-- `unicodedata`, indexed by codepoint, whose entries carry a `bidi_class`
-- field (presumably defined by that file; confirm).
dofile("unicodedata.lua")
-- Global table that collects the bidi functions assigned further down.
bidi = {}
-- Rule P2: walk the codepoints in logical order and report the bidi class
-- of the first strong character, i.e. "L", "R" or "AL". Codepoints without
-- an entry in `unicodedata` are skipped. Returns nothing if the text holds
-- no strong character at all.
-- See <https://www.unicode.org/reports/tr9/#P2>
local function get_paragraph_direction(codepoints)
  for _, codepoint in ipairs(codepoints) do
    local entry = unicodedata[codepoint]
    local class = entry and entry.bidi_class
    if class == "L" or class == "R" or class == "AL" then
      return class
    end
  end
end
-- Reorder one paragraph from logical order into visual order.
-- See <https://www.unicode.org/reports/tr9/>
--
-- codepoints: sequence of codepoints in logical order.
-- Returns a new sequence holding the same codepoints in visual order.
bidi.get_visual_reordering = function(codepoints)
  -- rule P2
  local paragraph_direction =
    get_paragraph_direction(codepoints) or "L"
  -- rule P3
  local paragraph_embedding_level = 0
  -- The embedding direction is always "L" or "R". A paragraph that starts
  -- with an Arabic letter (AL) is right-to-left, so it maps to "R". Passing
  -- "AL" on as sos would trip the assertion in rule W2.
  local embedding_direction = "L"
  if (
    "R" == paragraph_direction or
    "AL" == paragraph_direction
  ) then
    paragraph_embedding_level = 1
    embedding_direction = "R"
  end
  -- FIXME: Rule X1 to X10 are not implemented yet. This basically
  -- means that explicit levels or display directions are ignored.
  local run = {}
  for i, codepoint in ipairs(codepoints) do
    local entry = unicodedata[codepoint]
    run[i] = {
      codepoint = codepoint,
      -- Codepoints without a known class inherit the paragraph direction.
      bidi_class = entry and entry.bidi_class or paragraph_direction,
      -- Without explicit embeddings every character starts at the
      -- paragraph embedding level; rules I1 and I2 raise it from there.
      embedding_level = paragraph_embedding_level,
    }
  end
  -- Hack: This code is almost certainly non-conforming …
  -- but it seems to “kinda” work. Someone should fix it!
  run = bidi.resolve_weak_types(run, embedding_direction)
  run = bidi.resolve_ni_types(run, embedding_direction)
  run = bidi.resolve_implicit_types(run, paragraph_embedding_level)
  run = bidi.reorder_resolved_levels(run, paragraph_embedding_level)
  -- Kept local so that the result does not leak into the global table.
  local codepoints_reordered = {}
  for i, element in ipairs(run) do
    codepoints_reordered[i] = element.codepoint
  end
  return codepoints_reordered
end
-- Rule W1: every nonspacing mark (NSM) takes over the type of the
-- character in front of it; an NSM at the very start of the run takes
-- the type of sos. Because the run is updated in place, a chain of NSMs
-- all end up with the type of the character that precedes the chain.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- sos: type assigned to the virtual position before the run.
-- Returns the same run.
bidi.W1 = function(run, sos)
  -- Type of whatever comes immediately before the current position.
  local inherited_class = sos
  for position = 1, #run do
    local element = run[position]
    if element.bidi_class == "NSM" then
      -- FIXME: handle isolate initiator, PDI (they should yield Other
      -- Neutral instead of passing their own type on).
      element.bidi_class = inherited_class
    end
    inherited_class = element.bidi_class
  end
  return run
end
-- Rule W2: search backward from each European number (EN) until the
-- first strong type (R, L, AL, or sos) is found. If that type is AL,
-- the European number becomes an Arabic number (AN).
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- sos: the text ordering type assigned to the virtual position before
--      an isolating run sequence; must be "L" or "R".
-- Returns the same run. Raises an assertion error for any other sos.
local W2 = function(run, sos)
  assert(
    "L" == sos or
    "R" == sos
  )
  for i = 1, #run do
    if "EN" == run[i].bidi_class then
      -- Start from sos: it is the answer whenever no strong character
      -- precedes the number. The bounded loop never touches run[0], so a
      -- number at the very start of the run is handled as well.
      local previous_strong_bidi_class = sos
      for j = i - 1, 1, -1 do
        local previous_bidi_class = run[j].bidi_class
        if (
          "L" == previous_bidi_class or -- left-to-right
          "R" == previous_bidi_class or -- right-to-left
          "AL" == previous_bidi_class -- right-to-left (arabic)
        ) then
          previous_strong_bidi_class = previous_bidi_class
          break
        end
      end
      if "AL" == previous_strong_bidi_class then
        run[i].bidi_class = "AN"
      end
    end
  end
  return run
end
-- Render a table as text for debugging: one "key: value" entry per line,
-- each line starting with a newline followed by one space per nesting
-- level. Nested tables are rendered recursively. Entries appear in the
-- order that pairs() yields them.
--
-- table_: the table to render.
-- indentation: nesting depth to start from (defaults to 0).
-- Returns the rendered text ("" for an empty table).
function dump_table(table_, indentation)
  assert(type(table_) == "table")
  indentation = indentation or 0
  assert(type(indentation) == "number")
  local pieces = {}
  for key, value in pairs(table_) do
    local line_start = "\n" .. (" "):rep(indentation) .. key .. ": "
    local rendered
    if type(value) == "table" then
      rendered = dump_table(value, indentation + 1)
    else
      rendered = tostring(value)
    end
    pieces[#pieces + 1] = line_start .. rendered
  end
  return table.concat(pieces)
end
-- Run one rule against a test input and compare the resulting bidi
-- classes with the expected ones, position by position.
--
-- description: text that starts the error message on failure.
-- rule: function called as rule(test_input, ...).
-- test_input: run (sequence of elements) handed to the rule.
-- expected_output: run whose `bidi_class` fields are the expected result.
-- ...: extra arguments forwarded to the rule (for example sos).
-- Raises an error made of the description followed by a dump of the
-- actual output as soon as one position differs.
local test_rule = function(
  description,
  rule,
  test_input,
  expected_output,
  ...
)
  assert( "string" == type(description) )
  assert( "function" == type(rule) )
  assert( "table" == type(test_input) )
  assert( "table" == type(expected_output) )
  local test_output = rule(test_input, ...)
  for i = 1, #test_input do
    if test_output[i].bidi_class ~= expected_output[i].bidi_class then
      -- Build the (potentially large) dump only when a check fails
      -- instead of once per compared element. Level 0 keeps the message
      -- free of position information, just like assert() would.
      error(description .. dump_table(test_output), 0)
    end
  end
end
-- Self-tests for rule W2; they run whenever this file is loaded.
do
  -- Turn a list of bidi class names into a run of elements.
  local function as_run(class_names)
    local run = {}
    for position, class_name in ipairs(class_names) do
      run[position] = { bidi_class = class_name }
    end
    return run
  end
  -- Each case: description, input classes, expected classes, sos.
  local cases = {
    {
      "Test W2: AL EN → AL AN",
      { "AL", "EN" },
      { "AL", "AN" },
      "L",
    },
    {
      "Test W2: AL NI EN → AL NI AN",
      { "AL", "NI", "EN" },
      { "AL", "NI", "AN" },
      "L",
    },
    {
      "Test W2: sos NI EN → sos NI EN (sos = L)",
      { "NI", "EN" },
      { "NI", "EN" },
      "L",
    },
    {
      "Test W2: sos NI EN → sos NI EN (sos = R)",
      { "NI", "EN" },
      { "NI", "EN" },
      "R",
    },
    {
      "Test W2: L NI EN → L NI EN",
      { "L", "NI", "EN" },
      { "L", "NI", "EN" },
      "L",
    },
    {
      "Test W2: R NI EN → R NI EN",
      { "R", "NI", "EN" },
      { "R", "NI", "EN" },
      "L",
    },
  }
  for _, case in ipairs(cases) do
    test_rule(case[1], W2, as_run(case[2]), as_run(case[3]), case[4])
  end
end
-- Rule W3: change all ALs to R.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- Returns the same run.
local function W3(run)
  for position = 1, #run do
    local element = run[position]
    if element.bidi_class == "AL" then
      element.bidi_class = "R"
    end
  end
  return run
end
-- Self-test for rule W3; it runs whenever this file is loaded.
test_rule(
  "Test W3: AL AL AL → R R R",
  W3,
  { { bidi_class = "AL" }, { bidi_class = "AL" }, { bidi_class = "AL" } },
  { { bidi_class = "R" }, { bidi_class = "R" }, { bidi_class = "R" } }
)
-- Rule W4: a single European separator (ES) between two European numbers
-- becomes a European number; a single common separator (CS) between two
-- numbers of the same type (EN or AN) becomes that type.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- Returns the same run.
local function W4(run)
  local last = #run
  for position = 1, last do
    local element = run[position]
    local class = element.bidi_class
    local is_separator = (class == "ES" or class == "CS")
    -- A separator at either end of the run lacks one neighbour and is
    -- therefore never changed.
    if is_separator and position > 1 and position < last then
      local before = run[position - 1].bidi_class
      local after = run[position + 1].bidi_class
      if before == after then
        if before == "EN" then
          element.bidi_class = "EN"
        elseif before == "AN" and class == "CS" then
          -- Only the common separator joins Arabic numbers.
          element.bidi_class = "AN"
        end
      end
    end
  end
  return run
end
-- Self-tests for rule W4; they run whenever this file is loaded.
do
  -- Turn a list of bidi class names into a run of elements.
  local function as_run(class_names)
    local run = {}
    for position, class_name in ipairs(class_names) do
      run[position] = { bidi_class = class_name }
    end
    return run
  end
  -- Each case: description, input classes, expected classes.
  local cases = {
    {
      "Test·W4:·EN·ES·EN·→·EN·EN·EN",
      { "EN", "ES", "EN" },
      { "EN", "EN", "EN" },
    },
    {
      "Test·W4:·EN·CS·EN·→·EN·EN·EN",
      { "EN", "CS", "EN" },
      { "EN", "EN", "EN" },
    },
    {
      "Test·W4:·AN·CS·AN·→·AN·AN·AN",
      { "AN", "CS", "AN" },
      { "AN", "AN", "AN" },
    },
  }
  for _, case in ipairs(cases) do
    test_rule(case[1], W4, as_run(case[2]), as_run(case[3]))
  end
end
-- Rule W5: a sequence of European terminators (ET) adjacent to European
-- numbers (EN) changes to all European numbers.
--
-- The run is processed one maximal ET sequence at a time, so that a
-- sequence in front of a number (ET ET EN) is converted completely and
-- not only the terminator that directly touches the number.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- Returns the same run.
bidi.W5 = function(run)
  local count = #run
  local i = 1
  while i <= count do
    if "ET" == run[i].bidi_class then
      -- Find the end of the ET sequence that starts at i.
      local first = i
      local last = i
      while last < count and "ET" == run[last + 1].bidi_class do
        last = last + 1
      end
      local touches_number = (
        1 < first and
        "EN" == run[first - 1].bidi_class
      ) or (
        count > last and
        "EN" == run[last + 1].bidi_class
      )
      if touches_number then
        for k = first, last do
          run[k].bidi_class = "EN"
        end
      end
      i = last + 1
    else
      i = i + 1
    end
  end
  return run
end
-- Rule W6: separators and terminators that are still present at this
-- point (ES, ET, CS) change to Other Neutral (ON).
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- Returns the same run.
bidi.W6 = function(run)
  local becomes_neutral = { ES = true, ET = true, CS = true }
  for position = 1, #run do
    local element = run[position]
    if becomes_neutral[element.bidi_class] then
      element.bidi_class = "ON"
    end
  end
  return run
end
-- Rule W7: search backward from each European number (EN) until the
-- first strong type (R, L, or sos) is found. If an L is found, the
-- European number changes to L.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- sos: type assigned to the virtual position before the run; it decides
--      the outcome for numbers that have no strong character before them.
-- Returns the same run.
bidi.W7 = function(run, sos)
  for i = 1, #run do
    if "EN" == run[i].bidi_class then
      -- Fall back to sos when the search reaches the start of the run
      -- without meeting a strong type.
      local previous_strong_bidi_class = sos
      for j = i - 1, 1, -1 do
        local previous_bidi_class = run[j].bidi_class
        if (
          "L" == previous_bidi_class or -- left-to-right
          "R" == previous_bidi_class -- right-to-left
        ) then
          previous_strong_bidi_class = previous_bidi_class
          break
        end
      end
      if "L" == previous_strong_bidi_class then
        run[i].bidi_class = "L"
      end
    end
  end
  return run
end
-- Resolve weak types by applying the rules W1 to W7 in order.
--
-- run: sequence of elements with a `bidi_class` field.
-- sos: type assigned to the virtual position before the run; it is
--      handed to the rules that take it (W1, W2 and W7).
-- Returns the run as returned by the last rule.
bidi.resolve_weak_types = function(run, sos)
  local pipeline = {
    { rule = bidi.W1, takes_sos = true },
    { rule = W2, takes_sos = true },
    { rule = W3, takes_sos = false },
    { rule = W4, takes_sos = false },
    { rule = bidi.W5, takes_sos = false },
    { rule = bidi.W6, takes_sos = false },
    { rule = bidi.W7, takes_sos = true },
  }
  for _, step in ipairs(pipeline) do
    if step.takes_sos then
      run = step.rule(run, sos)
    else
      run = step.rule(run)
    end
  end
  return run
end
-- Resolve neutral and isolate formatting types (NIs).
--
-- N0: FIXME: Process bracket pairs!
-- N1: an NI takes the direction of the surrounding strong text if the
--     nearest L or R before it and the nearest L or R after it agree.
-- N2: any remaining NI takes the embedding direction.
--
-- NOTE(review): rule N1 says that European and Arabic numbers act as if
-- they were R. The search below only stops at L and R, so numbers are
-- skipped rather than treated as R.
--
-- run: sequence of elements with a `bidi_class` field (modified in place).
-- embedding_direction: class assigned to NIs that fall under N2.
-- Returns the same run.
bidi.resolve_ni_types = function(run, embedding_direction)
  local is_ni = {
    B = true,
    S = true,
    WS = true,
    ON = true,
    FSI = true,
    LRI = true,
    RLI = true,
    PDI = true,
  }
  -- Return the first L or R met when walking from `from` to `to`,
  -- or nothing when there is none.
  local function nearest_strong(from, to, step)
    for position = from, to, step do
      local class = run[position].bidi_class
      if class == "L" or class == "R" then
        return class
      end
    end
  end
  for i, element in ipairs(run) do
    if is_ni[element.bidi_class] then
      -- NIs resolved earlier in this loop already carry their new class
      -- and are therefore seen by the backward search.
      local before = nearest_strong(i - 1, 1, -1)
      local after = nearest_strong(i + 1, #run, 1)
      if before ~= nil and before == after then
        -- N1
        element.bidi_class = before
      else
        -- N2
        element.bidi_class = embedding_direction
      end
    end
  end
  return run
end
-- Resolve implicit levels (rules I1 and I2) by raising the
-- `embedding_level` of each element according to its bidi class.
--
-- I1, even (left-to-right) embedding level: R goes up one level,
--     AN and EN go up two levels.
-- I2, odd (right-to-left) embedding level: L, EN and AN go up one level.
--
-- run: sequence of elements with `bidi_class` and `embedding_level`
--      fields (modified in place).
-- embedding_level: the level whose parity selects I1 or I2; the same
--      value is used for every element.
-- Returns the same run.
bidi.resolve_implicit_types = function(run, embedding_level)
  for _, element in ipairs(run) do
    local class = element.bidi_class
    local raise_by = 0
    if 0 == embedding_level % 2 then
      -- I1
      if class == "R" then
        raise_by = 1
      elseif class == "AN" or class == "EN" then
        raise_by = 2
      end
    elseif class == "L" or class == "EN" or class == "AN" then
      -- I2
      raise_by = 1
    end
    if raise_by > 0 then
      element.embedding_level = element.embedding_level + raise_by
    end
  end
  return run
end
-- Reverse, in place, every contiguous sequence of elements whose
-- embedding level is at minimum_embedding_level or higher.
--
-- run: sequence of elements with an `embedding_level` field.
-- minimum_embedding_level: lowest level that belongs to a sequence.
-- Returns the same run.
bidi.reverse_sequences = function(run, minimum_embedding_level)
  -- Reverse the elements at positions first..last by swapping from the
  -- outside in.
  local function reverse_range(first, last)
    while first < last do
      run[first], run[last] = run[last], run[first]
      first = first + 1
      last = last - 1
    end
  end
  local sequence_start
  for i = 1, #run do
    if minimum_embedding_level <= run[i].embedding_level then
      if nil == sequence_start then
        -- found the start of a sequence
        sequence_start = i
      end
    elseif nil ~= sequence_start then
      -- found the end of a sequence: it stops right before position i
      reverse_range(sequence_start, i - 1)
      sequence_start = nil
    end
  end
  if nil ~= sequence_start then
    -- A sequence that reaches the end of the run has no lower-level
    -- element after it, so it has to be closed here.
    reverse_range(sequence_start, #run)
  end
  return run
end
-- Reorder a run with resolved embedding levels for display.
--
-- run: sequence of elements with an `embedding_level` field.
-- paragraph_embedding_level: accepted but not used yet (see L1 below).
-- Returns the reordered run.
bidi.reorder_resolved_levels = function(run, paragraph_embedding_level)
  -- L1
  -- FIXME: Reset some embedding levels to paragraph embedding level!
  -- L2
  -- Starting at the highest level found in the text and going down to
  -- level 1, including levels that do not occur in the text, reverse
  -- every contiguous sequence of characters at that level or higher.
  local highest_level = 0
  for _, element in ipairs(run) do
    if element.embedding_level > highest_level then
      highest_level = element.embedding_level
    end
  end
  assert(
    "number" == type(highest_level)
  )
  local level = highest_level
  while level >= 1 do
    run = bidi.reverse_sequences(run, level)
    level = level - 1
  end
  -- L3
  -- FIXME: Fix combining marks applied to right-to-left characters.
  -- L4
  -- FIXME: Replace characters by mirrored glyphs.
  return run
end
--[[
dofile("utf8.lua")
local text = "Reuben Rivlin (ראובן ריבלין; * 1939 in Jerusalem)"
local text_reordered = utf8.codepoints_to_text(
bidi.get_visual_reordering(
utf8.text_to_codepoints(
text
)
)
)
print(text)
print(text_reordered)
--]]