local p =
-- Local alias for math.floor; used by the range search and the plane and
-- name lookups in this module.
local floor = math.floor
local function errorf(level, ...) if type(level)
-- Binary search for the range that contains a code point.
-- "ranges" is an array of arrays whose first and second items are the low and
-- high code points of a range; it must be sorted in ascending order for the
-- search to work. The number of ranges is taken from ranges.length when that
-- field is present, and otherwise computed with Module:TableTools.
-- Returns the matching range and its index. If no range matches, returns nil
-- and the last index that was examined (nil if "ranges" is empty).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require("Module:TableTools").length(ranges)
	local middle
	while first <= last do
		middle = floor((first + last) / 2)
		local candidate = ranges[middle]
		if codepoint < candidate[1] then
			-- The code point lies below this range: continue in the lower half.
			last = middle - 1
		elseif codepoint <= candidate[2] then
			return candidate, middle
		else
			-- The code point lies above this range: continue in the upper half.
			first = middle + 1
		end
	end
	return nil, middle
end
p.binary_range_search = binary_range_search
--local function linear_range_search(codepoint, ranges) for i, range in ipairs(ranges) do if range[1] <= codepoint and codepoint <= range[2] then return range end endend--
-- Load a module by indexing "loader" with the name of the module minus the-- "Module:Unicode data/" part. For instance, loader.blocks returns-- . If a module cannot be loaded, false will be-- returned.local loader = setmetatable
-- For the algorithm used to generate Hangul Syllable names,-- see "Hangul Syllable Name Generation" in section 3.12 of the-- Unicode Specification:-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdflocal name_hooks = name_hooks.length = #name_hooks
-- Holds the "name hook" range most recently matched by p.lookup_name, so that
-- a following lookup inside the same range can skip the binary search.
local name_range_cache
local function generate_name(data, codepoint) if type(data)
---- Checks that the code point is a number and in range.-- Does not check whether code point is an integer.-- Not usedlocal function check_codepoint(funcName, argIdx, val) require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') if codepoint < 0 or 0x10FFFF < codepoint then errorf("Codepoint %04X out of range", codepoint) endend--
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8function p.lookup_name(codepoint) -- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned -- (Cn) and specifically noncharacters: -- https://www.unicode.org/faq/private_use.html#nonchar4 if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF or floor(codepoint % 0x10000) >= 0xFFFE) then return ("
if name_range_cache -- Check if previously used "name hook" applies to this code point. and codepoint >= name_range_cache[1] and codepoint <= name_range_cache[2] then return generate_name(name_range_cache[3], codepoint) end local range = binary_range_search(codepoint, name_hooks) if range then name_range_cache = range return generate_name(range[3], codepoint) end
local data = loader[('names/%03X'):format(codepoint / 0x1000)] if data and data[codepoint] then return data[codepoint] -- Unassigned (Cn) consists of noncharacters and reserved characters. -- The character has been established not to be a noncharacter, -- and if it were assigned, its name would already been retrieved, -- so it must be reserved. else return ("
--
local planes =
-- Load if needed and assign it to this variable.local blocks
-- Stateless iterator step over the list of blocks.
-- Given the previous index "i", returns the next index followed by the first
-- three items of the entry at that index. Returns nothing once the list is
-- exhausted, which ends a generic "for" loop.
local function block_iter(blocks, i)
	local next_index = i + 1
	local entry = blocks[next_index]
	if not entry then
		return
	end
	-- The fields are listed by hand because unpack doesn't work on tables
	-- loaded with mw.loadData.
	return next_index, entry[1], entry[2], entry[3]
end
-- An ipairs-type iterator generator for the list of blocks.
-- Takes no arguments. Returns the triple expected by a generic "for" loop:
-- the step function block_iter, the block list obtained from loader.blocks,
-- and the initial index 0. Each iteration then yields an index followed by
-- the first three items of the corresponding block entry.
-- The (empty) parameter list is required by Lua's function-definition syntax;
-- without it the definition does not parse.
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end
-- Returns the name of the plane that contains the code point.
-- The plane number is the code point divided by 0x10000, rounded down. If the
-- "planes" table has no entry for that number, a generic name of the form
-- "Plane N" is returned instead.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Returns the third item of the block entry whose range contains the code
-- point (the same field that p.get_block_info compares against a block name),
-- or the string "No Block" if the code point is not inside any entry of
-- loader.blocks.
function p.lookup_block(codepoint)
	local match = binary_range_search(codepoint, loader.blocks)
	if not match then
		return "No Block"
	end
	return match[3]
end
function p.get_block_info(name) for i, block in ipairs(loader.blocks) do if block[3]
function p.is_valid_pagename(pagename) local has_nonws = false
for cp in mw.ustring.gcodepoint(pagename) do if (cp
0x005B) -- [or (cp == 0x005D) -- ] or (cp
0x180E) -- MONGOLIAN VOWEL SEPARATOR or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block or (cp
local printable, result = p.is_printable(cp) if not printable then return false end
if result ~= "space-separator" then has_nonws = true end end
return has_nonwsend
local function manual_unpack(what, from) if what[from + 1]
-- Ordering function for table.sort: a range sorts before another when its
-- first item (the low end of the range) is strictly smaller.
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end
-- Creates a function to look up data in a module that contains "singles" (a-- code point-to-data map) and "ranges" (an array containing arrays that contain-- the low and high code points of a range and the data associated with that-- range).-- "loader" loads and returns the "singles" and "ranges" tables.-- "match_func" is passed the code point and either the data or the "dots", and-- generates the final result of the function.-- The varargs ("dots") describes the default data to be returned if there wasn't-- a match.-- In case the function is used more than once, "cache" saves ranges that have-- already been found to match, or a range whose data is the default if there-- was no match.local function memo_lookup(data_module_subpage, match_func, ...) local dots = local cache = local singles, ranges
return function (codepoint) if not singles then local data_module = loader[data_module_subpage] singles, ranges = data_module.singles, data_module.ranges end
if singles[codepoint] then return match_func(codepoint, singles[codepoint]) end
local range = binary_range_search(codepoint, cache) if range then return match_func(codepoint, manual_unpack(range, 3)) end local range, index = binary_range_search(codepoint, ranges) if range then table.insert(cache, range) table.sort(cache, compare_ranges) return match_func(codepoint, manual_unpack(range, 3)) end if ranges[index] then local dots_range if codepoint > ranges[index][2] then dots_range = else -- codepoint < range[index][1] dots_range = end table.sort(cache, compare_ranges) end return match_func(codepoint) endend
-- Reports whether a character is combining.
-- The code point's combining class value is looked up through memo_lookup in
-- the "combining" data module; 0 is passed to memo_lookup as the default data
-- for code points that are not found there.
-- Returns true if the combining class value is present and not zero, and
-- false otherwise.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
-- for more information.
p.is_combining = memo_lookup(
	"combining",
	function (_, combining_class)
		if combining_class and combining_class ~= 0 then
			return true
		end
		return false
	end,
	0
)
-- Returns a code point's combining class value exactly as memo_lookup supplies
-- it from the "combining" data module; 0 is passed to memo_lookup as the
-- default data.
p.lookup_combiningclass = memo_lookup(
	"combining",
	function (_, combining_class)
		return combining_class
	end,
	0
)
-- Returns a copy of "str" in which every combining character (as determined by
-- p.is_combining) is preceded by a dotted circle. Only the resulting string is
-- returned, not the substitution count.
function p.add_dotted_circle(str)
	-- Replacement callback: returning nothing leaves the character unchanged.
	local function prefix_if_combining(char)
		local codepoint = mw.ustring.codepoint(char)
		if p.is_combining(codepoint) then
			return '◌' .. char
		end
	end

	local with_circles = mw.ustring.gsub(str, ".", prefix_if_combining)
	return with_circles
end
-- Looks up the value stored for a code point in the "control" data module.
-- Falls back to "assigned" when no value is supplied; "assigned" is also
-- passed to memo_lookup as the default data.
local lookup_control = memo_lookup(
	"control",
	function (_, control_value)
		if control_value then
			return control_value
		end
		return "assigned"
	end,
	"assigned"
)
p.lookup_control = lookup_control
-- Returns true unless lookup_control reports the code point as "unassigned".
function p.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return status ~= "unassigned"
end
function p.is_printable(codepoint) local result = lookup_control(codepoint) return (result
"space-separator"), resultend
function p.is_whitespace(codepoint) local result = lookup_control(codepoint) return (result
-- Returns the value stored for a code point in the "category" data module,
-- exactly as memo_lookup supplies it; "Cn" is passed to memo_lookup as the
-- default data.
p.lookup_category = memo_lookup(
	"category",
	function (_, category)
		return category
	end,
	"Cn"
)
-- Looks up the script code stored for a code point in the "scripts" data
-- module. Falls back to 'Zzzz' when no script code is supplied; "Zzzz" is also
-- passed to memo_lookup as the default data.
local lookup_script = memo_lookup(
	"scripts",
	function (_, script_code)
		if script_code then
			return script_code
		end
		return 'Zzzz'
	end,
	"Zzzz"
)
p.lookup_script = lookup_script
function p.get_best_script(str) -- Check type of argument, because mw.text.decode coerces numbers to strings! require "libraryUtil".checkType("get_best_script", 1, str, "string") -- Convert HTML character references (including named character references, -- or character entities) to characters. str = mw.text.decode(str, true) local scripts = for codepoint in mw.ustring.gcodepoint(str) do local script = lookup_script(codepoint) -- Ignore "Inherited", "Undetermined", or "Uncoded" scripts. if not (script
"Zinh" or script
function p.is_Latin(str) require "libraryUtil".checkType("get_best_script", 1, str, "string") str = mw.text.decode(str, true) -- Search for the leading bytes that introduce the UTF-8 encoding of the -- code points U+0340-U+10FFFF. If they are not found and there is at least -- one Latin-script character, the string counts as Latin, because the rest -- of the characters can only be Zyyy, Zinh, and Zzzz. -- The only scripts found below U+0370 (the first code point of the Greek -- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz. -- See the codepage in the UTF-8 article. if not str:find "[\205-\244]" then for codepoint in mw.ustring.gcodepoint(str) do if lookup_script(codepoint)
"Latn" then Latn = true elseif not (script
"Zinh" or script
-- Checks that a string contains only characters belonging to right-to-left-- scripts, or characters of ignorable scripts.function p.is_rtl(str) require "libraryUtil".checkType("get_best_script", 1, str, "string") str = mw.text.decode(str, true) -- Search for the leading bytes that introduce the UTF-8 encoding of the -- code points U+0580-U+10FFFF. If they are not found, the string can only -- have characters from a left-to-right script, because the first code point -- in a right-to-left script is U+0591, in the Hebrew block. if not str:find "[\214-\244]" then return false end local result = false local rtl = loader.scripts.rtl for codepoint in mw.ustring.gcodepoint(str) do local script = lookup_script(codepoint) if rtl[script] then result = true elseif not (script
"Zinh" or script
-- Reads parameter "arg" from "args" and parses it as a hexadecimal code point.
-- Calls errorf (presumably raising an error -- its definition is not fully
-- visible here) when the parameter is missing, is not a hexadecimal number,
-- or lies outside 0..0x10FFFF. Returns the code point as a number.
local function get_codepoint(args, arg)
	local hex_text = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local value = tonumber(hex_text, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base", tostring(arg))

	local in_range = 0 <= value and value <= 0x10FFFF
	if not in_range then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end

	return value
end
-- Resolves a function of this module from a parameter.
-- The value of parameter "arg" is trimmed with mw.text.trim and appended to
-- "prefix"; the member of "p" with that name is returned.
-- Calls errorf (presumably raising an error -- its definition is not fully
-- visible here) when the parameter is missing or when "p" has no member with
-- the resulting name.
local function get_func(args, arg, prefix)
	local raw_suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local func_name = prefix .. mw.text.trim(raw_suffix)
	local found = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return found
end
-- This function allows any of the "lookup" functions to be invoked. The first-- parameter is the word after "lookup_"; the second parameter is the code point-- in hexadecimal base.function p.lookup(frame) local func = get_func(frame.args, 1, "lookup_") local codepoint = get_codepoint(frame.args, 2) local result = func(codepoint) if func
function p.is(frame) local func = get_func(frame.args, 1, "is_") -- is_Latin and is_valid_pagename take strings. if func
p.is_valid_pagename or func
-- Export the module table holding the functions defined on "p" above.
return p