-- Table of functions exported by this module (returned at the end of the file).
local subexport = {}

-- Local aliases for the library functions used below.
local cp = mw.ustring.codepoint
local floor = math.floor
local min = math.min
local split = mw.text.split
-- Binary search for the range that contains `codepoint`.
-- (Copied from another module; the source reference is missing from the
-- original comment.)
--
-- `ranges` is an array of tables whose first two elements are the inclusive
-- lower and upper bounds of a range. The search only gives meaningful results
-- if the array is sorted in ascending order. The number of entries is read
-- from `ranges.length` when that field is set; otherwise it is computed with
-- the `length` function of "Module:table".
--
-- Returns the matching range and its index, or nil and the last index that
-- was probed when no range contains `codepoint` (that index is itself nil
-- when `ranges` is empty).
local function binaryRangeSearch(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:table".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			-- Target lies below this range: continue in the lower half.
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			-- Target lies above this range: continue in the upper half.
			low = mid + 1
		end
	end
	return nil, mid
end
-- Linear search for the range that contains `codepoint`.
-- (Copied from another module; the source reference is missing from the
-- original comment.)
--
-- `ranges` is an array of tables whose first two elements are the inclusive
-- lower and upper bounds of a range. The scan stops as soon as it reaches a
-- range that starts above `codepoint`, so the array is expected to be sorted
-- by lower bound.
--
-- Returns the matching range, or nothing if no range contains `codepoint`.
local function linearRangeSearch(codepoint, ranges)
	for _, range in ipairs(ranges) do
		if codepoint < range[1] then
			-- Every later range starts even higher; give up.
			break
		elseif codepoint <= range[2] then
			return range
		end
	end
end
-- Ordering predicate for table.sort: a range sorts before another when its
-- lower bound (first element) is strictly smaller.
local function compareRanges(range1, range2)
	local firstStart, secondStart = range1[1], range2[1]
	return firstStart < secondStart
end
-- Save previously used codepoint ranges in case another character is in the
-- same range. Starts empty; matched ranges are appended and the list is kept
-- sorted by lower bound so it can be scanned with linearRangeSearch.
local rangesCache = {}
--[=[
	Takes a codepoint or a character and finds the script code(s) (if any) that
	are appropriate for it based on the codepoint, using the data module
	[[Module:scripts/recognition data]]. That data module was generated from
	the scripts' character patterns.

	By default, it returns only the first script code if there are multiple
	matches (i.e. the code we take to be the default). If `all_scripts` is set,
	then a table of all matching codes is returned.
]=]
local charToScriptDatafunction subexport.charToScript(char, all_scripts) charToScriptData = charToScriptData or mw.loadData("Module:scripts/recognition data") local t = type(char) local codepoint if t
"number" then codepoint = char else error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)") :format(t)) end local ret = local individualMatch = charToScriptData.individual[codepoint] if individualMatch then ret = split(individualMatch, "%s*,%s*") else local range if rangesCache[1] then range = linearRangeSearch(codepoint, rangesCache) if range then for i, script in ipairs(range) do if i > 2 then table.insert(ret, script) if not all_scripts then break end end end end end if not ret[1] then local index = floor(codepoint / 0x1000) range = linearRangeSearch(index, charToScriptData.blocks) if not range and charToScriptData[index] then range = binaryRangeSearch(codepoint, charToScriptData[index]) if range then table.insert(rangesCache, range) table.sort(rangesCache, compareRanges) end end if range then for i, script in ipairs(range) do if i > 2 then table.insert(ret, script) if not all_scripts then break end end end end end end if not ret[1] then table.insert(ret, "None") end if all_scripts then return ret else return ret[1] endend
--[=[ Finds the best script for a string in a language-agnostic way. Converts each character to a codepoint. Iterates the counter for the script code if the codepoint is in the list of individual characters, or if it is in one of the defined ranges in the 4096-character block that it belongs to. Each script has a two-part counter, for primary and secondary matches. Primary matches are when the script is the first one listed; otherwise, it's a secondary match. When comparing scripts, first the total of both are compared (i.e. the overall number of matches). If these are the same, the number of primary and then secondary matches are used as tiebreakers. For example, this is used to ensure that `Grek` takes priority over `Polyt` if no characters which exclusively match `Polyt` are found, as `Grek` is a subset of `Polyt`. ]=]function subexport.findBestScriptWithoutLang(text) -- `scripts` contains counters for any scripts detected so far. Jpan and Kore are handled as special-cases, as they are combinations of other scripts. local scripts_mt = local weights_mt = scripts_mt.__index = function(t, k) local ret = if k
"Kore" and scripts_mt.Kore then for i = 1, 2 do ret[i] = t["Hani"][i] + t["Hang"][i] end else for i = 1, 2 do table.insert(ret, 0) end end return setmetatable(ret, weights_mt) end local scripts = setmetatable(scripts_mt) text = require("Module:utilities").get_plaintext(text) local combined_scripts = for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do for i, script in ipairs(subexport.charToScript(character, true)) do scripts[script] = scripts[script] local weight = min(i, 2) scripts[script][weight] = scripts[script][weight] + 1 end end -- Check the combined script counts. If a single constituent has the same count (i.e. it's the only one), discard the combined script. for combined_script, set in pairs(combined_scripts) do for script in pairs(set) do scripts[combined_script] = scripts[combined_script] if (scripts[script][1] + scripts[script][2])
return subexport