unicode_text/unicodedata.lua

48 lines
1.6 KiB
Lua

#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Nils Dagsson Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
-- Global lookup table: maps a code point (as a number) to a table holding
-- the properties read from that code point's record in UnicodeData.txt.
-- All property values are kept as the strings found in the file; a field
-- that is empty in the file is stored as the empty string "".
unicodedata = {}

-- https://www.unicode.org/reports/tr44/#Format_Conventions
-- https://www.unicode.org/reports/tr44/#UnicodeData.txt
-- https://www.unicode.org/L2/L1999/UnicodeData.html
-- One record is a hexadecimal code point followed by exactly 14 further
-- fields, each introduced by a semicolon and free of semicolons itself.
local pattern = "^(%x+)" .. (";([^;]*)"):rep(14) .. "$"

-- NOTE(review): every line yields exactly one entry, so code point ranges
-- that the file presumably lists only as a "First"/"Last" pair would get
-- entries for their end points only -- confirm whether callers need more.
local line_number = 0
for line in io.lines("UnicodeData.txt") do
	line_number = line_number + 1
	-- Declared here so the assignment below does not create (and keep
	-- overwriting) the global variables `codepoint_hex` and `_`.
	local codepoint_hex, _
	local properties = {}
	codepoint_hex,
	properties.name,
	properties.general_category,
	properties.canonical_combining_class,
	properties.bidi_class,
	properties.decomposition_mapping,
	properties.decimal_digit_value,
	properties.digit_value,
	properties.numeric_value,
	properties.bidi_mirrored,
	_, -- Unicode 1.0 Name (obsolete)
	_, -- 10646 comment field (obsolete)
	properties.simple_uppercase_mapping,
	properties.simple_lowercase_mapping,
	properties.simple_titlecase_mapping
		= line:match(pattern)
	-- A line that does not match produces only nil captures; fail with a
	-- message that names the record instead of a bare tonumber() error.
	if nil == codepoint_hex then
		error(
			("UnicodeData.txt:%d: malformed record: %s"):format(
				line_number,
				line
			)
		)
	end
	unicodedata[tonumber(codepoint_hex, 16)] = properties
end
-- Load-time sanity check: the entry for U+0077 must exist and carry the
-- expected character name and general category.
local small_w = unicodedata[0x0077]
assert( small_w.name == "LATIN SMALL LETTER W" )
assert( small_w.general_category == "Ll" ) -- "Ll" marks a lowercase letter