Module:Unicode data: Difference between revisions
Module:Unicode data (view source)
Revision as of 02:46, 16 September 2025
02:46, 16 September 2025: Add Khitan Small Script (which was omitted so showed as "reserved" in the Unichar template), condense contiguous ranges
m (1 revision imported) |
En:Wp>Drmccreedy (Add Khitan Small Script (which was omitted so showed as "reserved" in the Unichar template), condense contiguous ranges) |
||
| Line 58: | Line 58: | ||
-- Unicode Specification: | -- Unicode Specification: | ||
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | ||
-- binary_range_search assumes these are ordered by codepoint. Do not place them in a random order! | |||
local name_hooks = { | local name_hooks = { | ||
{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | { 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | ||
| Line 77: | Line 78: | ||
{ 0xD800, 0xDFFF, "<surrogate-%04X>" }, | { 0xD800, 0xDFFF, "<surrogate-%04X>" }, | ||
{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use | { 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use | ||
-- CJK Compatibility Ideographs | -- CJK Compatibility Ideographs | ||
{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0x17000, | { 0xFE00, 0xFE0F, function (codepoint) -- Variation Selectors | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1) | |||
end}, | |||
{ 0x13460, 0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A | |||
{ 0x17000, 0x187FF, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph | |||
{ 0x18800, 0x18AFF, function (codepoint) | { 0x18800, 0x18AFF, function (codepoint) | ||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | ||
end }, | end }, | ||
{ 0x18D00, | { 0x18B00, 0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script | ||
{ 0x18CFF, 0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script | |||
{ 0x18D00, 0x18D1E, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement | |||
{ 0x18D80, 0x18DF2, function (codepoint) | |||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x18A7F) | |||
end }, | |||
{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
{ 0x2A700, | { 0x2A700, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C, D | ||
{ 0x2B820, 0x2CEAD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | |||
{ 0x2B820, | |||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | { 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I | ||
{ | { 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | ||
{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | |||
{ 0x31350, 0x33479, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H, J | |||
{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | { 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | ||
end}, | end}, | ||
{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | { 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | ||
{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | { 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | ||
| Line 126: | Line 135: | ||
--]] | --]] | ||
function p.is_noncharacter(codepoint) | |||
function p. | |||
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | -- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | ||
-- (Cn) and specifically noncharacters: | -- (Cn) and specifically noncharacters: | ||
-- https://www.unicode.org/faq/private_use.html#nonchar4 | -- https://www.unicode.org/faq/private_use.html#nonchar4 | ||
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF | |||
or floor(codepoint % 0x10000) >= 0xFFFE) then | or floor(codepoint % 0x10000) >= 0xFFFE) | ||
end | |||
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | |||
function p.lookup_name(codepoint) | |||
if p.is_noncharacter(codepoint) then | |||
return ("<noncharacter-%04X>"):format(codepoint) | return ("<noncharacter-%04X>"):format(codepoint) | ||
end | end | ||
| Line 435: | Line 449: | ||
local Latn = false | local Latn = false | ||
local i = 0; -- indexer for use in error messages | |||
for codepoint in mw.ustring.gcodepoint(str) do | for codepoint in mw.ustring.gcodepoint(str) do | ||
i = i + 1; -- bump the indexer | |||
local script = lookup_script(codepoint) | local script = lookup_script(codepoint) | ||
| Line 443: | Line 459: | ||
elseif not (script == "Zyyy" or script == "Zinh" | elseif not (script == "Zyyy" or script == "Zinh" | ||
or script == "Zzzz") then | or script == "Zzzz") then | ||
return false | return false, i -- abandon as not Latn; identify the offending character's position | ||
end | end | ||
end | end | ||
return Latn | return Latn, (not Latn and i) or nil -- when <Latn> false, return offending character's position as second return value; otherwise nil | ||
end | end | ||
| Line 542: | Line 558: | ||
local codepoint = get_codepoint(frame.args, 2) | local codepoint = get_codepoint(frame.args, 2) | ||
return (func(codepoint)) -- Adjust to one result. | return (func(codepoint)) -- Adjust to one result. | ||
end | |||
end | |||
function p.lookup_kCantonese(codepoint) | |||
local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))] | |||
if data then | |||
return data[codepoint] | |||
end | end | ||
end | end | ||
return p | return p | ||