Module:Unicode data: Difference between revisions

1,115 bytes added ,  02:46, 16 September 2025
Add Khitan Small Script (which was omitted so showed as "reserved" in the Unichar template), condense contiguous ranges
m (1 revision imported)
En:Wp>Drmccreedy
(Add Khitan Small Script (which was omitted so showed as "reserved" in the Unichar template), condense contiguous ranges)
Line 58: Line 58:
-- Unicode Specification:
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- binary_range_search assumes these are ordered by codepoint. Do not place them in a random order!
local name_hooks = {
local name_hooks = {
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
Line 77: Line 78:
{  0xD800,  0xDFFF, "<surrogate-%04X>" },
{  0xD800,  0xDFFF, "<surrogate-%04X>" },
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
-- CJK Compatibility Ideographs
-- CJK Compatibility Ideographs
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
{  0xFE00,  0xFE0F, function (codepoint) -- Variation Selectors
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1)
end},
{  0x13460,  0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
{  0x17000,  0x187FF, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
{  0x18800,  0x18AFF, function (codepoint)
{  0x18800,  0x18AFF, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
end },
end },
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
{  0x18B00,  0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
{  0x18CFF,  0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
{  0x18D00,  0x18D1E, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
{  0x18D80,  0x18DF2, function (codepoint)
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x18A7F)
end },
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
{  0x2A700,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C, D
{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
{  0x2B820,  0x2CEAD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x33479, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H, J
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
end},
end},
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
Line 126: Line 135:
--]]
--]]


-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.is_noncharacter(codepoint)
function p.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
-- https://www.unicode.org/faq/private_use.html#nonchar4
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
or floor(codepoint % 0x10000) >= 0xFFFE) then
or floor(codepoint % 0x10000) >= 0xFFFE)
end
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
if p.is_noncharacter(codepoint) then
return ("<noncharacter-%04X>"):format(codepoint)
return ("<noncharacter-%04X>"):format(codepoint)
end
end
Line 435: Line 449:
local Latn = false
local Latn = false
local i = 0; -- indexer for use in error messages
for codepoint in mw.ustring.gcodepoint(str) do
for codepoint in mw.ustring.gcodepoint(str) do
i = i + 1; -- bump the indexer
local script = lookup_script(codepoint)
local script = lookup_script(codepoint)
Line 443: Line 459:
elseif not (script == "Zyyy" or script == "Zinh"
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
or script == "Zzzz") then
return false
return false, i -- abandon as not Latn; identify the offending character's position
end
end
end
end
return Latn
return Latn, (not Latn and i) or nil -- when <Latn> false, return offending character's position as second return value; nil else
end
end


Line 542: Line 558:
local codepoint = get_codepoint(frame.args, 2)
local codepoint = get_codepoint(frame.args, 2)
return (func(codepoint)) -- Adjust to one result.
return (func(codepoint)) -- Adjust to one result.
end
end
--- Look up the kCantonese entry stored for a codepoint.
-- The data is fetched through `loader` from a submodule whose name is
-- 'Unihan/kCantonese/' followed by the codepoint divided by 0x1000 (floored),
-- formatted as at least two uppercase hex digits.
-- @param codepoint number: the codepoint to look up
-- @return the value stored under `codepoint` in that submodule's table;
--         nothing at all when `loader` yields no table for that page
function p.lookup_kCantonese(codepoint)
	local page = floor(codepoint / 0x1000)
	local submodule = ('Unihan/kCantonese/%02X'):format(page)
	local entries = loader[submodule]
	if not entries then
		-- Bare return: no values, the same as falling off the end of the function.
		return
	end
	return entries[codepoint]
end
end
end


return p
return p
Anonymous user