Compare commits
No commits in common. "78a13f7490ac44f9303561057253031f11489648" and "6bd77156c889f9f52022496438e0e551f1c7e34c" have entirely different histories.
78a13f7490
...
6bd77156c8
11
README.rst
11
README.rst
|
@ -223,15 +223,10 @@ Hint 1: <https://unifoundry.com/unifont/index.html>
|
|||
|
||||
Hint 2: <https://trevorldavis.com/R/hexfont/>
|
||||
|
||||
Why is Arabic / Hebrew / Urdu etc. text rendered somewhat wrong?
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
Why is Arabic / Hebrew / Urdu etc. text rendered wrong, i.e. left to right?
|
||||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
I did not implement the entire `Unicode Bidirectional Algorithm`_.
|
||||
|
||||
If you are able to read a right-to-left language, please help …
|
||||
|
||||
.. _`Unicode Bidirectional Algorithm`:
|
||||
https://www.unicode.org/reports/tr9/
|
||||
I did not implement the Unicode Bidirectional Algorithm. Patches welcome.
|
||||
|
||||
Why is the generated pixels table upside down?
|
||||
++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
|
644
bidi.lua
644
bidi.lua
|
@ -1,644 +0,0 @@
|
|||
#!/usr/bin/env lua5.1
|
||||
|
||||
--[[
|
||||
Copyright © 2023 Nils Dagsson Moskopp (erle)
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
|
||||
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
|
||||
Overflow oder eine Format String Vulnerability zwischen die anderen
|
||||
Codezeilen und schreibe das auch nicht dran.
|
||||
]]--
|
||||
|
||||
dofile("unicodedata.lua")
|
||||
|
||||
bidi = {}
|
||||
|
||||
local get_paragraph_direction = function(codepoints)
	-- Rule P2: the paragraph direction is determined by the first
	-- character whose bidi class is strong (L, R or AL).
	-- See <https://www.unicode.org/reports/tr9/#P2>
	--
	-- codepoints: sequence of numeric codepoints
	-- returns: "L", "R" or "AL"; nothing if no strong character
	-- exists (the caller chooses the fallback direction)
	local index = 1
	while nil ~= codepoints[index] do
		local properties = unicodedata[codepoints[index]]
		local class = properties and properties.bidi_class
		if "L" == class or "R" == class or "AL" == class then
			return class
		end
		index = index + 1
	end
end
|
||||
|
||||
-- Reorder the codepoints of one paragraph from logical order into
-- visual order, using a partial implementation of the Unicode
-- Bidirectional Algorithm.
-- See <https://www.unicode.org/reports/tr9/>
--
-- codepoints: sequence of numeric codepoints in logical order
-- returns: a new sequence of the same codepoints in visual order
bidi.get_visual_reordering = function(codepoints)
	-- rule P2
	local paragraph_direction =
		get_paragraph_direction(codepoints) or "L"

	-- rule P3
	local paragraph_embedding_level = 0
	if (
		"R" == paragraph_direction or
		"AL" == paragraph_direction
	) then
		paragraph_embedding_level = 1
	end

	-- The start-of-sequence type (sos) and the embedding direction
	-- are directions, i.e. only ever L or R. The class of the first
	-- strong character may be AL, so derive the direction from the
	-- paragraph embedding level instead of passing that class on.
	local embedding_direction = "L"
	if 1 == paragraph_embedding_level % 2 then
		embedding_direction = "R"
	end

	-- FIXME: Rule X1 to X10 are not implemented yet. This basically
	-- means that explicit levels or display directions are ignored.
	-- Without explicit embeddings every character starts out at the
	-- paragraph embedding level.

	local run = {}
	for i, codepoint in ipairs(codepoints) do
		local properties = unicodedata[codepoint]
		run[i] = {
			codepoint = codepoint,
			-- codepoints without a known class take the class of
			-- the first strong character of the paragraph
			bidi_class =
				properties and
				properties.bidi_class or paragraph_direction,
			embedding_level = paragraph_embedding_level,
		}
	end

	-- Hack: This code is almost certainly non-conforming …
	-- but it seems to “kinda” work. Someone should fix it!
	run = bidi.resolve_weak_types(run, embedding_direction)
	run = bidi.resolve_ni_types(run, embedding_direction)
	run = bidi.resolve_implicit_types(run, paragraph_embedding_level)
	run = bidi.reorder_resolved_levels(run, paragraph_embedding_level)

	-- local on purpose: the result must not leak into the globals
	local codepoints_reordered = {}
	for i, element in ipairs(run) do
		codepoints_reordered[i] = element.codepoint
	end
	return codepoints_reordered
end
|
||||
|
||||
-- Rule W1: resolve nonspacing marks (NSM).
--
-- run: sequence of elements with a bidi_class field; modified in place
-- sos: type assigned to the virtual position before the sequence
-- returns: the same run table
bidi.W1 = function(run, sos)
	-- Examine each nonspacing mark (NSM) in the isolating run
	-- sequence, and change the type of the NSM to Other Neutral
	-- if the previous character is an isolate initiator or PDI,
	-- and to the type of the previous character otherwise. If
	-- the NSM is at the start of the isolating run sequence, it
	-- will get the type of sos.
	local isolate_or_pdi = {
		LRI = true,
		RLI = true,
		FSI = true,
		PDI = true,
	}
	for i = 1, #run do
		if "NSM" == run[i].bidi_class then
			if 1 == i then
				run[i].bidi_class = sos
			elseif isolate_or_pdi[run[i-1].bidi_class] then
				run[i].bidi_class = "ON"
			else
				-- The previous element was already processed, so a
				-- chain of NSMs inherits the type of its base.
				run[i].bidi_class = run[i-1].bidi_class
			end
		end
	end
	return run
end
|
||||
|
||||
-- Rule W2: a European number (EN) whose nearest preceding strong
-- type is AL becomes an Arabic number (AN).
--
-- run: sequence of elements with a bidi_class field; modified in place
-- sos: text ordering type assigned to the virtual position before
--      an isolating run sequence
-- returns: the same run table
local W2 = function(run, sos)
	assert(
		"AL" == sos or -- FIXME: find the actual bug & remove this line
		"L" == sos or
		"R" == sos
	)
	for i = 1, #run do
		if "EN" == run[i].bidi_class then
			-- Search backward from each instance of a European number
			-- until the first strong type (R, L, AL, or sos) is found.
			-- The search covers the indices i-1 down to 1 only, so a
			-- European number at the very start of the run is handled
			-- without reading in front of the run; reaching the start
			-- means sos was found, which never triggers a change.
			local previous_strong_bidi_class = nil
			for j = i - 1, 1, -1 do
				local previous_bidi_class = run[j].bidi_class
				if (
					"L" == previous_bidi_class or -- left-to-right
					"R" == previous_bidi_class or -- right-to-left
					"AL" == previous_bidi_class -- right-to-left (arabic)
				) then
					previous_strong_bidi_class = previous_bidi_class
					break
				end
			end
			-- If an AL is found, change the type of the European number
			-- to Arabic number.
			if "AL" == previous_strong_bidi_class then
				run[i].bidi_class = "AN"
			end
		end
	end
	return run
end
|
||||
|
||||
-- Render a table as text for debugging, one "key: value" entry per
-- line, with nested tables indented one level deeper.
--
-- table_: the table to render
-- indentation: optional nesting depth of the entries (default 0)
-- returns: a string; every entry starts with a newline followed by
--          `indentation` spaces. Entries appear in pairs() order,
--          which is unspecified.
dump_table = function(table_, indentation)
	assert( "table" == type(table_) )

	indentation = indentation or 0
	assert( "number" == type(indentation) )

	local result = {}
	for key, value in pairs(table_) do
		-- tostring() so that keys which cannot be concatenated
		-- (booleans, tables, functions) do not raise an error
		local prefix =
			"\n" .. (" "):rep(indentation) .. tostring(key) .. ": "
		if "table" == type(value) then
			result[#result + 1] = prefix .. dump_table(value, indentation + 1)
		else
			result[#result + 1] = prefix .. tostring(value)
		end
	end
	return table.concat(result)
end
|
||||
|
||||
-- Run one rule function on a test input and compare the resulting
-- bidi classes with the expected ones.
--
-- description: text that starts the error message on failure
-- rule: function(run, ...) returning a run
-- test_input: run (sequence of elements with a bidi_class field)
-- expected_output: run holding the expected bidi_class per position
-- ...: extra arguments forwarded to the rule (e.g. sos)
-- raises: an error made of the description and a dump of the actual
--         output, as soon as one position does not match
local test_rule = function(
	description,
	rule,
	test_input,
	expected_output,
	...
)
	assert( "string" == type(description) )
	assert( "function" == type(rule) )
	assert( "table" == type(test_input) )
	assert( "table" == type(expected_output) )
	local test_output = rule(test_input, ...)
	for i = 1, #test_input do
		if test_output[i].bidi_class ~= expected_output[i].bidi_class then
			-- The dump is only built for an actual mismatch; level 0
			-- keeps the message free of position information.
			error(
				description ..
				dump_table(test_output),
				0
			)
		end
	end
end
|
||||
|
||||
-- Self-tests for rule W2, run when the file is loaded.
do
	-- Turn a list of bidi class names into a run.
	local make_run = function(bidi_classes)
		local run = {}
		for i, bidi_class in ipairs(bidi_classes) do
			run[i] = { ["bidi_class"] = bidi_class }
		end
		return run
	end

	-- Each case: description, input classes, expected classes, sos.
	-- "NI" is a placeholder class that W2 has to leave untouched.
	local cases = {
		{
			"Test W2: AL EN → AL AN",
			{ "AL", "EN" },
			{ "AL", "AN" },
			"L",
		},
		{
			"Test W2: AL NI EN → AL NI AN",
			{ "AL", "NI", "EN" },
			{ "AL", "NI", "AN" },
			"L",
		},
		{
			"Test W2: sos NI EN → sos NI EN (sos = L)",
			{ "NI", "EN" },
			{ "NI", "EN" },
			"L",
		},
		{
			"Test W2: sos NI EN → sos NI EN (sos = R)",
			{ "NI", "EN" },
			{ "NI", "EN" },
			"R",
		},
		{
			"Test W2: L NI EN → L NI EN",
			{ "L", "NI", "EN" },
			{ "L", "NI", "EN" },
			"L",
		},
		{
			"Test W2: R NI EN → R NI EN",
			{ "R", "NI", "EN" },
			{ "R", "NI", "EN" },
			"L",
		},
	}

	for _, case in ipairs(cases) do
		test_rule(
			case[1],
			W2,
			make_run(case[2]),
			make_run(case[3]),
			case[4]
		)
	end
end
|
||||
|
||||
-- Rule W3: change all ALs to R.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- returns: the same run table
local W3 = function(run)
	for _, element in ipairs(run) do
		if "AL" == element.bidi_class then
			element.bidi_class = "R"
		end
	end
	return run
end
|
||||
|
||||
-- Self-test for rule W3, run when the file is loaded.
do
	local input = {}
	local expected = {}
	for i = 1, 3 do
		input[i] = { ["bidi_class"] = "AL" }
		expected[i] = { ["bidi_class"] = "R" }
	end
	test_rule(
		"Test W3: AL AL AL → R R R",
		W3,
		input,
		expected
	)
end
|
||||
|
||||
-- Rule W4: resolve single separators between two numbers.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- returns: the same run table
local W4 = function(run)
	-- Only elements with a neighbour on both sides can change, so
	-- the first and the last element are never visited.
	for i = 2, #run - 1 do
		local current = run[i].bidi_class
		local before = run[i-1].bidi_class
		local after = run[i+1].bidi_class
		if before == after then
			if (
				"EN" == before and
				( "ES" == current or "CS" == current )
			) then
				-- A single European separator between two European
				-- numbers changes to a European number; a single
				-- common separator between two European numbers
				-- does the same.
				run[i].bidi_class = "EN"
			elseif "AN" == before and "CS" == current then
				-- A single common separator between two Arabic
				-- numbers changes to an Arabic number.
				run[i].bidi_class = "AN"
			end
		end
	end
	return run
end
|
||||
|
||||
-- Self-tests for rule W4, run when the file is loaded.
do
	-- Turn a list of bidi class names into a run.
	local make_run = function(bidi_classes)
		local run = {}
		for i, bidi_class in ipairs(bidi_classes) do
			run[i] = { ["bidi_class"] = bidi_class }
		end
		return run
	end

	-- Each case: description, input classes, expected classes.
	local cases = {
		{
			"Test·W4:·EN·ES·EN·→·EN·EN·EN",
			{ "EN", "ES", "EN" },
			{ "EN", "EN", "EN" },
		},
		{
			"Test·W4:·EN·CS·EN·→·EN·EN·EN",
			{ "EN", "CS", "EN" },
			{ "EN", "EN", "EN" },
		},
		{
			"Test·W4:·AN·CS·AN·→·AN·AN·AN",
			{ "AN", "CS", "AN" },
			{ "AN", "AN", "AN" },
		},
	}

	for _, case in ipairs(cases) do
		test_rule(
			case[1],
			W4,
			make_run(case[2]),
			make_run(case[3])
		)
	end
end
|
||||
|
||||
-- Rule W5: a sequence of European terminators adjacent to European
-- numbers changes to all European numbers.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- returns: the same run table
bidi.W5 = function(run)
	local i = 1
	while i <= #run do
		if "ET" == run[i].bidi_class then
			-- Find the last element of this sequence of terminators,
			-- so that the sequence is converted as a whole. Looking
			-- at direct neighbours only would miss the leading
			-- terminators of e.g. ET ET EN.
			local last = i
			while (
				#run > last and
				"ET" == run[last+1].bidi_class
			) do
				last = last + 1
			end
			if (
				1 < i and
				"EN" == run[i-1].bidi_class
			) or (
				#run > last and
				"EN" == run[last+1].bidi_class
			) then
				for k = i, last do
					run[k].bidi_class = "EN"
				end
			end
			i = last + 1
		else
			i = i + 1
		end
	end
	return run
end
|
||||
|
||||
-- Rule W6: Otherwise, separators and terminators change to Other
-- Neutral.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- returns: the same run table
bidi.W6 = function(run)
	-- European separator, European terminator, common separator
	local becomes_neutral = {
		ES = true,
		ET = true,
		CS = true,
	}
	for _, element in ipairs(run) do
		if becomes_neutral[element.bidi_class] then
			element.bidi_class = "ON"
		end
	end
	return run
end
|
||||
|
||||
-- Rule W7: a European number (EN) whose nearest preceding strong
-- type is L becomes L.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- sos: type assigned to the virtual position before the sequence
-- returns: the same run table
bidi.W7 = function(run, sos)
	for i = 1, #run do
		if "EN" == run[i].bidi_class then
			-- Search backward from each instance of a European number
			-- until the first strong type (R, L, or sos) is found. If an
			-- L is found, then change the type of the European number to
			-- L.
			--
			-- sos sits in front of the first element, so it is what
			-- the search finds if the run holds no strong type before
			-- the number.
			local previous_strong_bidi_class = sos
			for j = i - 1, 1, -1 do
				local previous_bidi_class = run[j].bidi_class
				if (
					"L" == previous_bidi_class or -- left-to-right
					"R" == previous_bidi_class -- right-to-left
				) then
					previous_strong_bidi_class = previous_bidi_class
					break
				end
			end
			if "L" == previous_strong_bidi_class then
				run[i].bidi_class = "L"
			end
		end
	end
	return run
end
|
||||
|
||||
-- Resolve weak types by applying the rules W1 to W7 in order.
--
-- run: sequence of elements with a bidi_class field
-- sos: type assigned to the virtual position before the sequence;
--      handed to the rules that need it (W1, W2 and W7)
-- returns: the run returned by the last rule
bidi.resolve_weak_types = function(run, sos)
	local steps = {
		{ rule = bidi.W1, needs_sos = true },
		{ rule = W2, needs_sos = true },
		{ rule = W3 },
		{ rule = W4 },
		{ rule = bidi.W5 },
		{ rule = bidi.W6 },
		{ rule = bidi.W7, needs_sos = true },
	}
	for _, step in ipairs(steps) do
		if step.needs_sos then
			run = step.rule(run, sos)
		else
			run = step.rule(run)
		end
	end
	return run
end
|
||||
|
||||
-- Resolve neutral and isolate formatting types (NIs), rules N1 and N2.
--
-- run: sequence of elements with a bidi_class field; modified in place
-- embedding_direction: class given to NIs that are not surrounded by
--                      text of one direction
-- returns: the same run table
bidi.resolve_ni_types = function(run, embedding_direction)
	local is_ni = {
		B = true,
		S = true,
		WS = true,
		ON = true,
		FSI = true,
		LRI = true,
		RLI = true,
		PDI = true,
	}

	-- Direction a class exerts on neighbouring NIs. European and
	-- Arabic numbers act as if they were R in terms of their
	-- influence on NIs.
	local influence = {
		L = "L",
		R = "R",
		EN = "R",
		AN = "R",
	}

	-- Return the direction of the first influencing element found
	-- when walking from index `first` to index `last` in steps of
	-- `step`, or nil if there is none.
	local find_direction = function(first, last, step)
		for j = first, last, step do
			local direction = influence[run[j].bidi_class]
			if nil ~= direction then
				return direction
			end
		end
		return nil
	end

	for i = 1, #run do
		-- N0
		-- FIXME: Process bracket pairs!
		if is_ni[run[i].bidi_class] then
			local previous_direction = find_direction(i - 1, 1, -1)
			local next_direction = find_direction(i + 1, #run, 1)
			if (
				nil ~= previous_direction and
				previous_direction == next_direction
			) then
				-- N1
				-- A sequence of NIs takes the direction of the
				-- surrounding strong text if the text on both sides
				-- has the same direction.
				run[i].bidi_class = previous_direction
			else
				-- N2
				-- Any remaining NIs take the embedding direction.
				-- This covers NIs without text on one side as well.
				run[i].bidi_class = embedding_direction
			end
		end
	end
	return run
end
|
||||
|
||||
-- Resolve implicit levels, rules I1 and I2.
--
-- run: sequence of elements with bidi_class and embedding_level
--      fields; the levels are raised in place
-- embedding_level: level whose parity selects the rule for all elements
-- returns: the same run table
bidi.resolve_implicit_types = function(run, embedding_level)
	-- I1
	-- For all characters with an even (left-to-right) embedding
	-- level, those of type R go up one level and those of type AN
	-- or EN go up two levels.
	local raise_on_even = { R = 1, AN = 2, EN = 2 }
	-- I2
	-- For all characters with an odd (right-to-left) embedding
	-- level, those of type L, EN or AN go up one level.
	local raise_on_odd = { L = 1, EN = 1, AN = 1 }

	for _, element in ipairs(run) do
		local raise = raise_on_odd
		if 0 == embedding_level % 2 then
			raise = raise_on_even
		end
		local increment = raise[element.bidi_class]
		if nil ~= increment then
			element.embedding_level = element.embedding_level + increment
		end
	end
	return run
end
|
||||
|
||||
-- Reverse, in place, every contiguous sequence of elements that are
-- at minimum_embedding_level or higher.
--
-- run: sequence of elements with an embedding_level field
-- minimum_embedding_level: lowest level that belongs to a sequence
-- returns: the same run table
bidi.reverse_sequences = function(run, minimum_embedding_level)
	-- Swap the elements between the indices first and last (both
	-- inclusive) pairwise, from the outside to the inside.
	local reverse_range = function(first, last)
		while first < last do
			run[first], run[last] = run[last], run[first]
			first = first + 1
			last = last - 1
		end
	end

	local sequence_start = nil
	for i = 1, #run do
		if minimum_embedding_level <= run[i].embedding_level then
			if nil == sequence_start then
				-- found the start of a sequence
				sequence_start = i
			end
		elseif nil ~= sequence_start then
			-- found the end of a sequence: element i is the first
			-- one that no longer belongs to it
			reverse_range(sequence_start, i - 1)
			sequence_start = nil
		end
	end
	if nil ~= sequence_start then
		-- A sequence that reaches the end of the run is not followed
		-- by a lower level element, so it is closed here.
		reverse_range(sequence_start, #run)
	end
	return run
end
|
||||
|
||||
-- Reorder a run with resolved embedding levels into visual order.
-- Of the rules L1 to L4 only L2 is implemented.
--
-- run: sequence of elements with an embedding_level field
-- paragraph_embedding_level: currently unused
-- returns: the reordered run
bidi.reorder_resolved_levels = function(run, paragraph_embedding_level)
	-- L1
	-- FIXME: Reset some embedding levels to paragraph embedding level!

	-- L2
	-- From the highest level found in the text to the lowest odd level
	-- on each line, including intermediate levels not actually present
	-- in the text, reverse any contiguous sequence of characters that
	-- are at that level or higher.
	local level = 0
	for index = 1, #run do
		local candidate = run[index].embedding_level
		if candidate > level then
			level = candidate
		end
	end
	assert(
		"number" == type(level)
	)
	-- 1 is the lowest odd level, so the passes stop after it
	while level >= 1 do
		run = bidi.reverse_sequences(run, level)
		level = level - 1
	end

	-- L3
	-- FIXME: Fix combining marks applied to right-to-left characters.

	-- L4
	-- FIXME: Replace characters by mirrored glyphs.
	return run
end
|
||||
|
||||
--[[
|
||||
dofile("utf8.lua")
|
||||
|
||||
local text = "Reuben Rivlin (ראובן ריבלין; * 1939 in Jerusalem)"
|
||||
local text_reordered = utf8.codepoints_to_text(
|
||||
bidi.get_visual_reordering(
|
||||
utf8.text_to_codepoints(
|
||||
text
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
print(text)
|
||||
print(text_reordered)
|
||||
--]]
|
88
example.txt
88
example.txt
|
@ -1,7 +1,3 @@
|
|||
Arabic:
|
||||
نص حكيم له سر قاطع وذو شأن عظيم
|
||||
مكتوب على ثوب أخضر ومغلف بجلد أزرق
|
||||
|
||||
Azeri:
|
||||
Zəfər, jaketini də papağını da götür,
|
||||
bu axşam hava çox soyuq olacaq.
|
||||
|
@ -17,27 +13,29 @@ Czech:
|
|||
Danish:
|
||||
Høj bly gom vandt fræk sexquiz på wc.
|
||||
|
||||
Esperanto:
|
||||
Eĥoŝanĝo ĉiuĵaŭde.
|
||||
|
||||
Ethiopic:
|
||||
ሰማይ አይታረስ ንጉሥ አይከሰስ።
|
||||
|
||||
Finnish:
|
||||
Törkylempijävongahdus.
|
||||
|
||||
Georgian:
|
||||
გთხოვთ ახლავე გაიაროთ რეგისტრაცია
|
||||
|
||||
German (non-ASCII letters):
|
||||
Heizölrückstoßabdämpfung
|
||||
|
||||
Greek:
|
||||
Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία
|
||||
|
||||
Hebrew:
|
||||
German (non-ASCII letters):
|
||||
Heizölrückstoßabdämpfung
|
||||
|
||||
Hebrew (does not work yet):
|
||||
נקודה מודגשת
|
||||
|
||||
Icelandic:
|
||||
Kæmi ný öxi hér, ykist þjófum nú bæði
|
||||
víl og ádrepa.
|
||||
|
||||
Irish:
|
||||
d'Ith cat mór dubh na héisc lofa go pras
|
||||
|
||||
Russian:
|
||||
В чащах юга жил бы цитрус?
|
||||
Да, но фальшивый экземпляр!
|
||||
|
||||
Hungarian:
|
||||
Árvíztűrő tükörfúrógép
|
||||
|
||||
|
@ -47,39 +45,21 @@ Hiragana:
|
|||
うゐのおくやまけふこえて
|
||||
あさきゆめみしゑひもせす
|
||||
|
||||
Icelandic:
|
||||
Kæmi ný öxi hér, ykist þjófum nú bæði
|
||||
víl og ádrepa.
|
||||
|
||||
Irish:
|
||||
d'Ith cat mór dubh na héisc lofa go pras
|
||||
|
||||
Katakana:
|
||||
イロハニホヘト チリヌルヲ
|
||||
ワカヨタレソ ツネナラム
|
||||
ウヰノオクヤマ ケフコエテ
|
||||
アサキユメミシ ヱヒモセスン
|
||||
|
||||
Korean:
|
||||
다람쥐 헌 쳇바퀴에 타고파
|
||||
Georgian:
|
||||
გთხოვთ ახლავე გაიაროთ რეგისტრაცია
|
||||
|
||||
Malayalam:
|
||||
ബ്qരഹ്മപുരത്തേക്ക്
|
||||
Ethiopic:
|
||||
ሰማይ አይታረስ ንጉሥ አይከሰስ።
|
||||
|
||||
Runic:
|
||||
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ
|
||||
|
||||
Russian:
|
||||
В чащах юга жил бы цитрус?
|
||||
Да, но фальшивый экземпляр!
|
||||
|
||||
Thai:
|
||||
พ่อขุนรามคำแหงมหาราช
|
||||
|
||||
Uyghur:
|
||||
زۆھرەگۈل ئابدۇۋاجىت فرانسىيەنىڭ
|
||||
پارىژدىكى خېلى بىشەم ئوقۇغۇچى.
|
||||
|
||||
Various latin alphabet variants:
|
||||
𝐛𝐨𝐥𝐝 𝖋𝖗𝖆𝖐𝖙𝖚𝖗 𝒊𝒕𝒂𝒍𝒊𝒄 𝓼𝓬𝓻𝓲𝓹𝓽
|
||||
𝕕𝕠𝕦𝕓𝕝𝕖-𝕤𝕥𝕣𝕦𝕔𝕜 𝚖𝚘𝚗𝚘𝚜𝚙𝚊𝚌𝚎
|
||||
|
@ -91,26 +71,6 @@ Emoji:
|
|||
🔀 🔁 🔂 🔃 🔄 🔅 🔆 🔇
|
||||
😀 😁 😂 😃 😄 😅 😆 😇
|
||||
|
||||
Line breaking:
|
||||
|
||||
U+000A LINE FEED
|
||||
LINE 1
|
||||
LINE 2
|
||||
|
||||
U+000A LINE FEED
|
||||
LINE 1
LINE 2
|
||||
|
||||
U+000D CARRIAGE RETURN
|
||||
U+000A LINE FEED
|
||||
LINE 1
|
||||
LINE 2
|
||||
|
||||
U+0085 NEXT LINE
|
||||
LINE 1
LINE 2
|
||||
|
||||
U+2029 PARAGRAPH SEPARATOR
|
||||
LINE 1
LINE 2
|
||||
|
||||
Horizontal tabulator:
|
||||
1 2 3 4 5 6
|
||||
t a b
|
||||
|
@ -147,10 +107,8 @@ Combining characters:
|
|||
U+0EB3 LAO VOWEL SIGN AM
|
||||
ກຳ
|
||||
|
||||
Bidirectional text:
|
||||
Thai:
|
||||
พ่อขุนรามคำแหงมหาราช
|
||||
|
||||
Year rendered before hebrew name, no LRM:
|
||||
Rivlin (ראובן ריבלין; * 1939 Jerusalem)
|
||||
|
||||
Year rendered after hebrew name with LRM:
|
||||
Rivlin (ראובן ריבלין; * 1939 Jerusalem)
|
||||
Malayalam:
|
||||
ബ്qരഹ്മപുരത്തേക്ക്
|
||||
|
|
29
hexfont.lua
29
hexfont.lua
|
@ -14,7 +14,6 @@ Overflow oder eine Format String Vulnerability zwischen die anderen
|
|||
Codezeilen und schreibe das auch nicht dran.
|
||||
]]--
|
||||
|
||||
dofile("bidi.lua")
|
||||
dofile("pixelops.lua")
|
||||
dofile("unicodedata.lua")
|
||||
dofile("utf8.lua")
|
||||
|
@ -226,9 +225,8 @@ hexfont.render_line = function(self, text)
|
|||
for i = 1, 16 do
|
||||
result[i] = {}
|
||||
end
|
||||
local codepoints = bidi.get_visual_reordering(
|
||||
utf8.text_to_codepoints(text)
|
||||
)
|
||||
local codepoints = utf8.text_to_codepoints(text)
|
||||
-- FIXME: only works for LTR, should use UAX #9
|
||||
for i = 1, #codepoints do
|
||||
local codepoint = codepoints[i]
|
||||
local bitmap_hex = self[codepoint]
|
||||
|
@ -296,28 +294,7 @@ hexfont.render_text = function(self, text)
|
|||
|
||||
local result
|
||||
local max_width = 0
|
||||
-- According to UAX #14, line breaks happen on:
|
||||
-- • U+000A LINE FEED
|
||||
-- • U+000D CARRIAGE RETURN (except as part of CRLF)
|
||||
-- • U+0085 NEXT LINE
|
||||
-- • U+2029 PARAGRAPH SEPARATOR
|
||||
--
|
||||
-- Hack: Replace all of those with LINE FEED.
|
||||
-- FIXME: This makes CRLF into two newlines …
|
||||
local codepoints = utf8.text_to_codepoints(text)
|
||||
for i, codepoint in ipairs(codepoints) do
|
||||
if (
|
||||
0x000D == codepoints[i] or
|
||||
0x0085 == codepoints[i] or
|
||||
0x2029 == codepoints[i]
|
||||
) then
|
||||
codepoints[i] = 0x000A
|
||||
end
|
||||
end
|
||||
-- FIXME: Code below should only operate on codepoints! Converting
|
||||
-- back and forth makes it needlessly slow – but I do not know how
|
||||
-- to split a table properly to get a single table for each line …
|
||||
text = utf8.codepoints_to_text(codepoints)
|
||||
-- TODO: implement UAX #14
|
||||
for utf8_line in string.gmatch(text .. "\n", "([^\n]*)\n") do
|
||||
local pixels = self:render_line(utf8_line)
|
||||
assert( nil ~= pixels )
|
||||
|
|
62
utf8.lua
62
utf8.lua
|
@ -16,68 +16,6 @@ Codezeilen und schreibe das auch nicht dran.
|
|||
|
||||
utf8 = {}
|
||||
|
||||
-- convert a table with codepoints into an UTF-8 string
-- inspired by <http://news.dieweltistgarnichtso.net/bin/unicode>
--
-- codepoints: sequence of integers from 1 to 1114111 (U+10FFFF)
-- returns: the UTF-8 encoded string
-- raises: "invalid codepoint: …" for a number outside of that range
--         or a number that is not an integer
-- Note: U+0000 is rejected and surrogates (U+D800 to U+DFFF) are
-- encoded like any other codepoint.
utf8.codepoints_to_text = function(codepoints)
	assert(
		"table" == type(codepoints)
	)
	-- Validate everything before encoding anything.
	for _, codepoint in ipairs(codepoints) do
		if (
			0 >= codepoint or
			1114111 < codepoint or
			math.floor(codepoint) ~= codepoint
		) then
			error(
				string.format(
					"invalid codepoint: %s",
					codepoint
				)
			)
		end
	end
	local codepoints_encoded = {}
	for i, codepoint in ipairs(codepoints) do
		if codepoint <= 0x7F then
			-- one byte encoding
			codepoints_encoded[i] = string.char(codepoint)
		elseif codepoint <= 0x7FF then
			-- two bytes encoding: holds 11 bits, so U+07FF is the
			-- last codepoint that fits; U+0800 needs three bytes
			codepoints_encoded[i] = string.char(
				math.floor(codepoint / 64) + 192,
				codepoint % 64 + 128
			)
		elseif codepoint <= 0xFFFF then
			-- three bytes encoding
			codepoints_encoded[i] = string.char(
				math.floor(codepoint / 4096) + 224,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		else
			-- four bytes encoding (up to U+10FFFF, checked above)
			codepoints_encoded[i] = string.char(
				math.floor(codepoint / 262144) + 240,
				math.floor(codepoint / 4096) % 64 + 128,
				math.floor(codepoint / 64) % 64 + 128,
				codepoint % 64 + 128
			)
		end
	end
	return table.concat(codepoints_encoded)
end
|
||||
|
||||
-- Self-test, run when the file is loaded: encode one codepoint for
-- each UTF-8 sequence length (one to four bytes) and compare the
-- result with the literal text.
local text = utf8.codepoints_to_text(
	{0x0077, 0x00F0, 0x2665, 0x10348} -- w, ð, ♥, 𐍈
)
assert(
	text == "wð♥𐍈"
)
|
||||
|
||||
-- convert an UTF-8 string into a table with codepoints
|
||||
-- inspired by <https://lua-users.org/wiki/LuaUnicode>
|
||||
utf8.text_to_codepoints = function(text)
|
||||
|
|
Loading…
Reference in New Issue