-- Module:Sandbox/DePiep/uchar explained

-- todo split consist Char and Args
-- todo cwith double dotcircle 230/239, 233, 234
-- thought: option "special notes", listing: "whitespace, control, combining, NaC, .."
require('strict')

local p = {}

local getArgs = require('Module:Arguments').getArgs
local uChar_data = mw.loadData('Module:Sandbox/DePiep/uchar/data')
local uData = require('Module:Unicode data')
local uData_helper = require('Module:Sandbox/DePiep/uchar-helper')
local uBaseConvert = require('Module:BaseConvert')
local yesno = require('Module:Yesno')
local str = require('Module:String')
local plaintext = require('Module:Plain text')
--- local tabletools = require('Module:TableTools')

-- Module-level state.
-- NOTE(review): both initial values were lost in extraction; '' and {} are
-- reconstructed from how the names are used (string concatenation, indexing).
local ERRstatus = '' -- last error text, set by convertHexInToHex0x
local tUchar = {} -- result table, assigned in p.main

-- Invisible characters are written as UTF-8 byte escapes so they survive
-- copy/paste and stay visible in the source.
local DOTTED_CIRCLE = '◌' -- U+25CC
local NBSP = '\194\160' -- U+00A0
local LEFT_TO_RIGHT_MARK = '\226\128\142' -- U+200E LEFT-TO-RIGHT MARK
local DEFAULT_IMAGE_SIZE = '21px'
local WS_BLUE = 'lightblue'

-- Test helper: builds a span (id "testH") holding a <big> child that carries
-- the WS_BLUE background and the wikitext s; returns the span as a string.
local function testH(s)
	local root = mw.html.create('span')
	root:attr('id', 'testH')

	-- tag() returns the new child node, so the styling lands on <big>.
	local big = root:tag('big')
	big:css('background', WS_BLUE)
	big:wikitext(s)

	return tostring(root)
end

-- Wraps tChar.uChar in an enlarged span; whitespace characters additionally
-- get the WS_BLUE background so they are visible.
-- tChar: table; reads fields uChar and uIsWhitespace.
-- Returns the span serialised to a string.
local function addStyles(tChar)
	local h = mw.html.create('span')

	h
		:attr('id', 'testH')
		:css('font-size', '150%')
		:wikitext(tChar.uChar)
	-- explicit "== true": only a real boolean true triggers the highlight
	if tChar.uIsWhitespace == true then
		h:css('background', WS_BLUE)
	end

	return tostring(h)
end

-- Entry point for testing testH: passes the first argument through it.
function p.testH(frame)
	return testH(getArgs(frame)[1])
end

-- Demo of the mw.html builder: a div with an id, a width style, some text
-- and a trailing <hr>.
function p.testFromDoc(frame)
	local div = mw.html.create('div')
	div
		:attr('id', 'testdiv')
		:css('width', '100%')
		:wikitext('Some text')
		:tag('hr')
	return tostring(div)
	-- Output: <div id="testdiv" style="width:100%;">Some text<hr /></div>
end

-- FORMATTERS

-- Stub for wrapping s in a div/span tag.
-- s: string to wrap; arg, val: not used yet; divspan: 'div' or 'span'.
-- Returns s and a report string for a supported tag name, nil otherwise.
local function inTag(s, arg, val, divspan)
	-- NOTE(review): initial value of rprt was lost in extraction; '' assumed.
	local rprt = ''
	if divspan ~= 'div' and divspan ~= 'span' then
		return nil -- ERR
	end

	return s, rprt
end

-- nil-safe wrapper around mw.text.decode.
-- Returns nil for nil input, otherwise the decoded string.
local function decodeString(s)
	if s == nil then
		return nil
	end
	return mw.text.decode(s)
end

-- Format string in <code> tag / from m:str find word
-- replaces whitespace by single nbsp (keep untrimmed ws visible)
-- Returns '' for nil input.
local function inCode(s)
	if s == nil then
		return ''
	end
	-- parentheses keep only gsub's first result (the string)
	s = (string.gsub(s, '%s+', '&nbsp;'))
	return '<code>' .. s .. '</code>'
end

-- Use mono font-family (from: Template:Mono)
-- Each whitespace run becomes a single nbsp; nil is treated as ''.
local function inMono(s)
	if s == nil then
		s = ''
	end
	-- parentheses keep only gsub's first result (the string)
	s = (string.gsub(s, '%s+', '&nbsp;'))
	-- NOTE(review): the opening/closing tags were lost in extraction; the span
	-- below follows Template:Mono and must be checked against the original.
	return '<span class="monospaced" style="font-family: monospace;">' .. s .. '</span>'
end

-- Wraps s for small-caps display.
-- Returns '' for nil or empty input, so callers can concatenate the result.
local function inSmallcaps(s)
	if (s == nil) or (s == '') then
		return ''
	end
	-- Smallcaps/styles.css: span.smallcaps
	-- NOTE(review): the markup was lost in extraction; the class name is taken
	-- from the comment above, the exact original tag needs confirming.
	return '<span class="smallcaps">' .. s .. '</span>'
end

-- Builds an external wikilink to fileformat.info.
-- Depending on the parameter used, links one of two pages:
--   uHexBare0x given: character data page, labelled with uHexFormat
--   otherwise:        General Category list page for sGenCat
-- Returns '' when neither uHexBare0x nor sGenCat is given.
local function xlLinkFileFormat(uHexBare0x, uHexFormat, sGenCat)
	if uHexBare0x ~= nil then
		-- Character data page
		-- https://www.fileformat.info/info/unicode/char/00ad/index.htm (or "/ad/"); no 0x no uc
		return '[https://www.fileformat.info/info/unicode/char/'
			.. string.lower(uHexBare0x) .. '/index.htm ff.info ' .. uHexFormat .. ']'
	end
	if sGenCat == nil then
		return ''
	end
	-- GenCat list, for example gencat "Nd":
	-- https://www.fileformat.info/info/unicode/category/Nd/list.htm
	return '[https://www.fileformat.info/info/unicode/category/'
		.. sGenCat .. '/list.htm ff.info ' .. sGenCat .. ']'
end

-- UHEX HANDLERS & FORMATTERS ----- ----- ----- ----- ----- ----- ----- ----- -----

-- Formats a hex string into the normal form "U+00A9".
-- uHex0x: hex digits with optional '0x' prefix, e.g. '0x00A9'
-- uLink:  when non-nil, a todo marker is appended (linking not implemented)
-- Leading zeros are dropped, then the value is left-padded to at least 4 digits.
local function formatUhex(uHex0x, uLink)
	local uHexFmt -- working
	-- parentheses keep only gsub's first result (the string)
	uHexFmt = (string.gsub(uHex0x, '^0x', ''))
	uHexFmt = (string.gsub(uHexFmt, '^0*', ''))
	uHexFmt = 'U+' .. string.sub('0000' .. uHexFmt, - math.max(#uHexFmt, 4))
	if uLink ~= nil then
		return uHexFmt .. '_[todo: fmt Uhex_link_U+]'
	end
	return uHexFmt
end

-- Formats a General Category code as "<code in mono>=<first data entry>".
-- sGenCat: category code used as key into uChar_data.tGenCat
-- fmt: not used
-- Returns '' when the code has no entry in the data table.
local function formatGenCat(sGenCat, fmt)
	local tCat = uChar_data.tGenCat[sGenCat]
	if tCat == nil then
		-- NOTE(review): return value lost in extraction; '' assumed because
		-- the caller concatenates the result.
		return ''
	end
	return inMono(sGenCat) .. '=' .. tCat[1]
end

-- Formats table (array) using concat
-- replace space by nbsp (keep untrimmed sp)
-- in monospace font-family
-- Returns '' for nil input.
local function formatTablelist(t) -- unused?
	if t == nil then
		return ''
	end
	local s = table.concat(t, '; ')
	-- Parentheses keep only gsub's first result; without them the match count
	-- is passed on as mw.text.decode's second argument.
	-- NOTE(review): replacement shown as ' ' in the extract; '&nbsp;' assumed
	-- from the comment above.
	s = (string.gsub(s, '%s+', '&nbsp;'))
	s = mw.text.decode(s)
	return '<' .. inMono(s) .. '>'
end

-- Decides which prefix is shown before a (possibly combining) character.
-- is_combining: boolean, whether the character is a combining mark
-- cWith: raw 'cwith' argument; nil, a yes/no word, or a replacement character
-- Returns: prefix string, report string (debug trace of the decision).
--   cWith nil or yes-like : DOTTED_CIRCLE if is_combining == true, else ''
--   cWith no-like         : '' (prefix suppressed)
--   anything else         : the cleaned cWith text itself
local function formatCombiningChar(is_combining, cWith)
	local addPrefix
	local uCombWith -- working, cWith logic
	local rprt
	-- todo need 4-way logic for cwith
	cWith = decodeString(cWith)
	rprt = 'is_combi: ' .. tostring(is_combining) .. '; cwith: ' .. tostring(cWith)

	-- strip wikicode; but save NBSP -- todo improve, test
	if cWith ~= nil then
		-- shield NBSP with a marker word while Module:Plain text runs
		cWith = string.gsub(cWith, NBSP, 'NBSP')
		cWith = plaintext._main(cWith, false)
		cWith = string.gsub(cWith, 'NBSP', NBSP)
	end
	uCombWith = yesno(cWith) -- y/n/nil (3-way logic; 'foo' == nil)
	addPrefix = ''
	if (cWith == nil) or (uCombWith == true) then
		-- default: per is_combining
		rprt = rprt .. '_dflt non-combi = none'
		if is_combining == true then
			addPrefix = DOTTED_CIRCLE
			rprt = rprt .. '_dflt'
		end
	elseif uCombWith == false then
		-- explicitly false, so suppress
		addPrefix = ''
		rprt = rprt .. '_false, suppress'
	else
		-- use character provided by cwith
		addPrefix = cWith
		rprt = rprt .. '_cleanchar: ' .. tostring(cWith)
	end
	return addPrefix, rprt
end

-- READ & PROCESS

-- Normalises any accepted hex notation ('A9', '00a9', 'U+00A9', '0x00A9',
-- with stray whitespace) into the working forms.
-- Returns: uHex0x ('0x00A9'), uHexNum (169), uHexBare0x ('00A9'),
--          uHexFormat ('U+00A9').
-- On failure returns nil and stores the reason in ERRstatus.
local function convertHexInToHex0x(uHexAnyform)
	if (uHexAnyform == nil) or (uHexAnyform == '') then
		ERRstatus = 'ERR convertHexInToHex0x: no uHex input'
		return nil
	end

	-- tostring: a numeric input must not reach the string-only decoder
	local uHexBare0x = decodeString(tostring(uHexAnyform))
	uHexBare0x = string.gsub(uHexBare0x, '%s', '')
	-- uppercase first, so 'u+' and '0X' prefixes are recognised as well
	uHexBare0x = string.upper(uHexBare0x)
	uHexBare0x = string.gsub(uHexBare0x, '^U%+', '')
	uHexBare0x = string.gsub(uHexBare0x, '^0X', '')

	-- number check: only hex digits may remain (rejects '', signs, '.', 'P')
	if string.find(uHexBare0x, '^%x+$') == nil then
		-- report the offending input, not the failed conversion result
		ERRstatus = 'ERR convertHexInToHex0x: uHex is not hex: >' .. tostring(uHexAnyform) .. '<'
		return nil
	end

	local uHex0x = '0x' .. uHexBare0x
	local uHexNum = tonumber(uHexBare0x, 16)
	if (uHexNum == nil) or (uHexNum < 0) or (uHexNum > 0x10FFFF) then
		ERRstatus = 'ERR convertHexInToHex0x: uHex out of U+ range' .. uHex0x
		return nil
	end

	local uHexFormat = formatUhex(uHex0x)

	return uHex0x, uHexNum, uHexBare0x, uHexFormat
end

-- Converts a hex value (e.g. '0x00A9') to decimal via Module:BaseConvert.
-- Returns nil for nil input.
local function convertHexToDec(uHex0x)
	if uHex0x == nil then
		return nil
	end
	-- NOTE(review): the argument table was lost in extraction; n/base/from
	-- reconstructed from the function name — confirm against the original.
	return uBaseConvert.convert{n = uHex0x, base = 10, from = 16}
end

-- Converts a decimal value to hex via Module:BaseConvert.
-- Returns nil for nil input.
-- todo: dec input is NaN, err, edge
local function convertDecToHex(uDec)
	if uDec == nil then
		return nil
	end
	-- NOTE(review): the argument table was lost in extraction; n/base/from
	-- reconstructed from the function name — confirm against the original.
	return uBaseConvert.convert{n = uDec, base = 16, from = 10}
end

-- GET DATA

-- Returns the Unicode block of a code point, as reported by
-- Module:Unicode data.
-- The lookup result used to be discarded in favour of the placeholder 'blck'.
local function getBlock(uHexNum)
	return uData.lookup_block(uHexNum)
end

-- Returns "<plane number>: <plane entry>" for a code point; the plane number
-- is the code point divided by 0x10000, rounded down, and the entry comes
-- from uChar_data.tPlanes.
local function getPlane(uHexNum)
	local planeNo = math.floor(uHexNum / 0x10000)
	local planeName = uChar_data.tPlanes[planeNo]
	return planeNo .. ': ' .. planeName
end

-- Looks up the Canonical Combining Class (CCC) through the helper module.
-- todo: 239 (230), 233, 234 = between spacing chars.
local function getCombiningClass(uHex0x)
	-- NOTE(review): the fallback operand after 'or' was lost in extraction;
	-- '' assumed.
	return uData_helper.lookup_combiningclass(uHex0x) or '' -- new -helper function
end

-- Returns the named HTML entities for a code point, from the data list keyed
-- by decimal value, each formatted in mono and joined into one string.
-- demo: [168]='&uml;, &die;, &Dot;, &DoubleDot;'
-- uDec: decimal code point (number or numeric string); fmt: not used.
-- Returns nil when the code point has no entry.
local function getNamedEntities(uDec, fmt)
	local tNamedEntitiesData = mw.loadData('Module:Numcr2namecr')
	local tNames = {}

	local sNameList = tNamedEntitiesData[tonumber(uDec)]
	if sNameList == nil then
		return nil
	end
	sNameList = decodeString(sNameList) -- has literal '&' in source

	-- the text between '&' and ';' of each entity
	local patstring = '%f[^&][^%;]+%f[%;]'
	local hitCount = 0
	while hitCount <= 20 do
		hitCount = hitCount + 1
		-- last argument is the value returned when there is no such match
		local hitWord = str._match(sNameList, patstring, 1, hitCount, false, '')
		hitWord = mw.text.trim(hitWord)
		if hitWord == '' then
			-- no more hits in the string
			break
		end
		-- NOTE(review): shown as '&' in the extract; '&amp;' assumed so the
		-- entity name is displayed instead of being rendered.
		table.insert(tNames, inMono('&amp;' .. hitWord .. ';'))
	end
	-- NOTE(review): separator shown as two spaces; it may have held nbsp.
	return table.concat(tNames, '  ') -- double spaced
end

-- Returns the aliases of a code point grouped by reason:
-- { abbreviation = {...}, alternate = {...}, control = {...},
--   correction = {...}, figment = {...} }
-- demo 0x002118 = weier
-- Returns nil when the code point has no entry in the alias data.
local function getAliases(uHex)
	local tAllAliases = mw.loadData('Module:Unicode data/aliases')

	local tCPalias = tAllAliases[uHex]
	if tCPalias == nil then
		return nil
	end

	-- for 2-deep 5-subtable (Aliases)
	local tAlias5 = {
		abbreviation = {},
		alternate = {},
		control = {},
		correction = {},
		figment = {},
	}

	for _, v in ipairs(tCPalias) do
		-- v[1] selects the sub-table, v[2] is the value stored in it
		if type(v) == 'table' then
			local bucket = tAlias5[v[1]]
			if bucket == nil then
				-- a type outside the five above: keep it, do not index nil
				bucket = {}
				tAlias5[v[1]] = bucket
			end
			table.insert(bucket, v[2])
		end
	end
	return tAlias5
end

-- Looks up sScriptISO in the 'aliases' table of Module:Unicode data/scripts.
-- Returns nil for nil input and '_unk' when there is no entry.
local function getScriptName(sScriptISO)
	if sScriptISO == nil then
		return nil
	end
	-- data is loaded only once the argument is known to be usable
	local UDscripts = mw.loadData('Module:Unicode data/scripts')

	return UDscripts.aliases[sScriptISO] or '_unk'
end

-- Formats a table of alias lists into one report line.
-- t5Alias: table mapping a name to an array of strings
-- fmt: not used (fmt = report)
-- Empty lists are skipped. Returns nil for nil input.
local function formatAlias5(t5Alias, fmt)
	if t5Alias == nil then
		return nil
	end

	-- pairs() has no defined order; sort the names for a stable report
	local tKeys = {}
	for k in pairs(t5Alias) do
		tKeys[#tKeys + 1] = k
	end
	table.sort(tKeys)

	-- NOTE(review): the line break was lost in extraction; '<br/>' assumed.
	local tParts = { '<br/>ALIASES: ' }
	for _, k in ipairs(tKeys) do
		local v = t5Alias[k]
		if #v > 0 then
			tParts[#tParts + 1] = ' ' .. k .. ': ' .. table.concat(v, '; ')
		end
	end
	return table.concat(tParts)
end

-- 1. PARSE INCOMING ARGS
-- 2. READ PROPERTIES
-- origArgs: argument table; reads [1]/hex, dec, char, link/nlink, format,
--           cwith, size, image, html, ulink, test.
-- Returns a new table with the normalised arguments and the looked-up
-- character properties. When no valid hex id can be derived, the table is
-- returned early with uHex0x == nil (only rprtOrigIDs is set).
local function getArgsAndProps(origArgs)
	local tNewArgs = {}

	local inHex, inDec, inChar = 1, 2, 3 -- 'inHex', 'inDec', 'inChar'
	local tOrigIn = {}
	-- the base input; stays nil without an id argument, so the converter
	-- reports "no uHex input" instead of receiving a number
	local uHexIn
	local uHex0x, uHexNum -- local working val

	-- PART 1 READ & NORMALISE ORIG ARGUMENTS
	-- HEX DEC CHAR
	local rprt = 'R-t0:' .. #tOrigIn
	tOrigIn[inHex] = origArgs[1] or origArgs['hex'] -- todo: split for check?
	tOrigIn[inDec] = origArgs['dec']
	tOrigIn[inChar] = decodeString(origArgs['char'])

	-- tOrigIn may have holes, so '#' is not reliable on it: count explicitly
	local tSeen = {}
	for _, key in ipairs({ inHex, inDec, inChar }) do
		if tOrigIn[key] ~= nil then
			tSeen[#tSeen + 1] = ' ' .. tostring(tOrigIn[key]) .. ';;'
		end
	end
	local nInputs = #tSeen
	rprt = rprt .. ' R-t2:' .. nInputs .. table.concat(tSeen)

	-- later assignments win: hex over char over dec
	if tOrigIn[inDec] ~= nil then
		uHexIn = convertDecToHex(tOrigIn[inDec])
		rprt = rprt .. ' dec;'
	end
	if tOrigIn[inChar] ~= nil then
		uHexIn = convertDecToHex(mw.ustring.codepoint(tOrigIn[inChar]))
		rprt = rprt .. ' char;'
	end
	if tOrigIn[inHex] ~= nil then
		uHexIn = tOrigIn[inHex]
		rprt = rprt .. ' hex;'
	end

	-- REPORT todo: what if >1 input?: err msg, prio, conflictcheck
	-- 2023-02-04: removed "\|" "invalid escape sequence" ???
	tNewArgs['rprtOrigIDs'] = ' |ID in: #t4=' .. nInputs .. ':>' .. rprt .. tostring(uHexIn) .. '<| '

	-- returns: uHex0x, uHexNum, uHexBare0x, uHexFormat
	tNewArgs['uHex0x'], tNewArgs['uHexNum'], tNewArgs['uHexBare0x'], tNewArgs['uHexFormat'] = convertHexInToHex0x(uHexIn)
	if tNewArgs['uHex0x'] == nil then
		-- ERROR
		-- shortcut to error #1: no uHex (valid 0x) input
		return tNewArgs
	end
	-- local shortcut only
	uHex0x = tNewArgs['uHex0x']
	uHexNum = tNewArgs['uHexNum']

	-- DEC
	tNewArgs['uDec'] = convertHexToDec(uHex0x)
	-- OTHER ORIG ARGS
	tNewArgs['uNameLink'] = origArgs['link'] or origArgs['nlink'] -- old nlink = depr paramname
	tNewArgs['format'] = origArgs['format'] or ''
	tNewArgs['cwith'] = decodeString(origArgs['cwith'])

	tNewArgs['uSize'] = origArgs['size']
	tNewArgs['uImage'] = origArgs['image']

	tNewArgs['html'] = origArgs['html'] -- depr?
	tNewArgs['ulink'] = origArgs['ulink'] -- old ulink = depr?

	-- test notice
	tNewArgs['test'] = origArgs['test'] or ''

	-- PART 2 READ & USE PROPERTIES

	-- ASSIGNED, GenCat, Control, Char
	tNewArgs['uIsAssigned'] = uData.is_assigned(uHexNum)

	if tNewArgs['uIsAssigned'] == true then
		tNewArgs['uGenCat'] = uData.lookup_category(uHexNum)
		-- bare digits: the numeric character reference takes no '0x' prefix
		tNewArgs['uChar'] = mw.text.decode('&#x' .. tNewArgs['uHexBare0x'] .. ';')
	else
		tNewArgs['uGenCat'] = 'Xx' -- todo not assigned == ?
		tNewArgs['uChar'] = 'ERR_not_assg' -- ERROR
	end
	tNewArgs['uBlock'] = uData.lookup_block(uHexNum)
	tNewArgs['uPlane'] = getPlane(uHexNum)

	-- assuming GenCat 'Cc' is 1:1 with "is control"
	tNewArgs['uIsControl'] = (tNewArgs['uGenCat'] == 'Cc')
	-- CHAR replacement
	if tNewArgs['uIsControl'] then
		tNewArgs['uChar'] = '&#xFFFD;' -- '?' placeholder
	end

	--NAME, ALIASES
	tNewArgs['uName'] = uData.lookup_name(uHexNum)
	tNewArgs['Aliases'] = getAliases(uHexNum) -- table5

	--PROPS Script, Latin, WS
	tNewArgs['uIsLatin'] = uData.is_Latin(tostring(tNewArgs['uChar']))
	tNewArgs['uScript'] = uData.lookup_script(uHexNum)
	tNewArgs['uScriptName'] = getScriptName(tNewArgs['uScript'])
	tNewArgs['uIsWhitespace'] = uData.is_whitespace(uHexNum)

	--PROPS rtl
	tNewArgs['uIsRtl'] = uData.is_rtl(tostring(tNewArgs['uChar']))

	--PROPS2 COMBINING PREFIX Combining/cwith/dottedcircle, CCC
	tNewArgs['uIsCombining'] = uData.is_combining(uHexNum) or false
	-- CCC is read once, for every character (combining or not)
	tNewArgs['uCombiningClass'] = getCombiningClass(uHexNum)
	tNewArgs['uCharPrefix'], tNewArgs['uCwithReport'] = formatCombiningChar(tNewArgs['uIsCombining'], tNewArgs['cwith'])

	-- CHAR SUFFIX; rtl
	if tNewArgs['uIsRtl'] == true then
		tNewArgs['uCharSuffix'] = LEFT_TO_RIGHT_MARK
	else
		tNewArgs['uCharSuffix'] = ''
	end

	--PROPS3: NamedEntities
	tNewArgs['NamedEntities'] = getNamedEntities(tNewArgs['uDec'])

	return tNewArgs
end

-- Placeholder for the module-to-module entry point; not implemented yet.
-- args is ignored and a fixed marker string is returned.
function p._main(args)
	local placeholder = '_todo _main'
	return placeholder
end

-- Main entry point (#invoke): reads the arguments, collects the character's
-- properties and returns a one-character report as wikitext.
-- Without a valid hex id it returns an error line built from ERRstatus.
function p.main (frame)
	-- NOTE(review): a second argument to getArgs (an options table) was lost
	-- in extraction; defaults are used here — confirm against the original.
	local origArgs = getArgs(frame)
	local s = ''

	tUchar = getArgsAndProps(origArgs)
	if tUchar['uHex0x'] == nil then
		return ' >' .. (origArgs[1] or '?') .. '< ERR hexIn ' .. ERRstatus .. ' ' .. (tUchar['rprtOrigIDs'] or 'unk1')
	end

	-- REPORT RPRT
	s = formatUhex(tUchar['uHex0x'])

	--string together & css format
	tUchar.uChar = tUchar['uCharPrefix'] .. tUchar.uChar .. tUchar['uCharSuffix'] -- cwith, rtl,
	local cssChar = addStyles(tUchar)
	if tUchar['uImage'] ~= nil then
		-- todo: image display; only the size is emitted for now
		s = s .. ' ' .. (tUchar['uSize'] or DEFAULT_IMAGE_SIZE) .. ' '
	else
		s = s .. ' ' .. cssChar .. ' '
	end

	s = s .. inSmallcaps(tUchar['uName'])
	-- NOTE(review): the line breaks inside the literals below were lost in
	-- extraction; '<br/>' assumed.
	s = s .. '<br/>[testing: ' .. tUchar['test'] .. ']' .. (tUchar['rprtOrigIDs'] or '?')
		.. '→ ' .. tUchar['uHex0x'] .. ' [' .. tUchar['uDec'] .. 'dec]'
		.. '; (' .. xlLinkFileFormat(tUchar['uHexBare0x'], tUchar['uHexFormat']) .. ') '
		.. 'GC: ' .. formatGenCat(tUchar['uGenCat'])
		.. ' (' .. xlLinkFileFormat(nil, nil, tUchar['uGenCat']) .. ')'
		.. '<br/>ASSIG: ' .. tostring(tUchar['uIsAssigned']) .. '; '
		.. 'WS: ' .. tostring(tUchar['uIsWhitespace'])
		.. '<br/>BLK: ' .. tUchar['uBlock'] .. '; PLANE: ' .. tUchar['uPlane'] .. '; '
		.. '<br/>SC: ' .. tUchar['uScript'] .. '=' .. tUchar['uScriptName']
		.. '; RTLsuffix:' .. tostring(tUchar['uIsRtl']) .. '; '
	s = s .. '<br/>COMBI PREFIX: >' .. tUchar['uCharPrefix'] .. '<; ' .. tUchar['uCwithReport']
		.. '; CCC class:' .. (tUchar['uCombiningClass'] or '-')

	if tUchar['NamedEntities'] ~= nil then
		s = s .. '<br/>NAMED ENTITIES: ' .. tUchar['NamedEntities']
	end

	if tUchar['Aliases'] ~= nil then
		s = s .. formatAlias5(tUchar['Aliases'], 'report')
	end
	return s
end

-- Test entry point: passes the 'char' argument to mw.ustring.codepoint with
-- the range 1..2 and returns whatever that call returns.
function p.test(frame)
	return mw.ustring.codepoint(frame.args['char'], 1, 2)
end

-- Test entry point: runs the first argument through getScriptName.
function p.testScriptName(frame)
	return getScriptName(frame.args[1])
end

return p