-- Module table; every public function is attached to it and it is returned
-- at the end of the file.
local ustring = {}

-- Copy these, just in case
-- Private references to the byte-oriented string functions that the rest of
-- this module calls as S.<name>.
local S = {
	byte = string.byte,
	char = string.char,
	len = string.len,
	sub = string.sub,
	find = string.find,
	match = string.match,
	gmatch = string.gmatch,
	gsub = string.gsub,
	format = string.format,
}

---- Configuration ----
-- To limit the length of strings or patterns processed, set these
ustring.maxStringLength = math.huge
ustring.maxPatternLength = math.huge
---- Utility functions ----
local function checkType(name, argidx, arg, expecttype, nilok) if arg
local function checkString(name, s) if type(s)
local function checkPattern(name, pattern) if type(pattern)
-- A private helper that splits a string into codepoints, and also collects the-- starting position of each character and the total length in codepoints.---- @param s string utf8-encoded string to decode-- @return tablelocal function utf8_explode(s) local ret =
local i = 1 local l = S.len(s) local cp, b, b2, trail local min while i <= l do b = S.byte(s, i) if b < 0x80 then -- 1-byte code point, 00-7F cp = b trail = 0 min = 0 elseif b < 0xc2 then -- Either a non-initial code point (invalid here) or -- an overlong encoding for a 1-byte code point return nil elseif b < 0xe0 then -- 2-byte code point, C2-DF trail = 1 cp = b - 0xc0 min = 0x80 elseif b < 0xf0 then -- 3-byte code point, E0-EF trail = 2 cp = b - 0xe0 min = 0x800 elseif b < 0xf4 then -- 4-byte code point, F0-F3 trail = 3 cp = b - 0xf0 min = 0x10000 elseif b
-- Check subsequent bytes for multibyte code points for j = i + 1, i + trail do b = S.byte(s, j) if not b or b < 0x80 or b > 0xbf then return nil end cp = cp * 0x40 + b - 0x80 end if cp < min then -- Overlong encoding return nil end
ret.codepoints[#ret.codepoints + 1] = cp ret.bytepos[#ret.bytepos + 1] = i ret.len = ret.len + 1 i = i + 1 + trail end
-- Two past the end (for sub with empty string) ret.bytepos[#ret.bytepos + 1] = l + 1 ret.bytepos[#ret.bytepos + 1] = l + 1
return retend
-- A private helper that finds the character offset for a byte offset.---- @param cps table from utf8_explode-- @param i int byte offset-- @return intlocal function cpoffset(cps, i) local min, max, p = 0, cps.len + 1 if i
---- Trivial functions ----
-- These functions are the same as the standard string versions
-- Re-exported unchanged from the standard string library; these operate on
-- bytes, not codepoints.
ustring.byte = string.byte
ustring.format = string.format
ustring.rep = string.rep
---- Non-trivial functions ----
-- These functions actually have to be UTF-8 aware
-- Determine if a string is valid UTF-8
--
-- Validity is decided by utf8_explode: the string is valid exactly when
-- decoding it does not return nil.
--
-- @param s string
-- @return boolean
function ustring.isutf8(s)
	checkString('isutf8', s)
	return utf8_explode(s) ~= nil
end
-- Return the byte offset of a character in a string---- @param s string-- @param l int codepoint number [default 1]-- @param i int starting byte offset [default 1]-- @return int|nilfunction ustring.byteoffset(s, l, i) checkString('byteoffset', s) checkType('byteoffset', 2, l, 'number', true) checkType('byteoffset', 3, i, 'number', true) local cps = utf8_explode(s) if cps
i = i or 1 if i < 0 then i = S.len(s) + i + 1 end if i < 1 or i > S.len(s) then return nil end local p = cpoffset(cps, i) if l > 0 and cps.bytepos[p]
-- Return codepoints from a string---- @see string.byte-- @param s string-- @param i int Starting character [default 1]-- @param j int Ending character [default i]-- @return int* Zero or more codepointsfunction ustring.codepoint(s, i, j) checkString('codepoint', s) checkType('codepoint', 2, i, 'number', true) checkType('codepoint', 3, j, 'number', true) local cps = utf8_explode(s) if cps
-- Return an iterator over the codepoints (as integers)
--   for cp in ustring.gcodepoint(s) do ... end
--
-- The requested codepoints are collected once up front via
-- ustring.codepoint; the returned closure then walks that list with an
-- index, so each step is O(1) (shifting the list with table.remove(cp, 1)
-- would make a full iteration quadratic).
--
-- @param s string
-- @param i int Starting character [default 1]
-- @param j int Ending character [default -1]
-- @return function Iterator; yields one codepoint per call, then nil
function ustring.gcodepoint(s, i, j)
	checkString('gcodepoint', s)
	checkType('gcodepoint', 2, i, 'number', true)
	checkType('gcodepoint', 3, j, 'number', true)
	local cp = { ustring.codepoint(s, i or 1, j or -1) }
	local idx = 0
	return function()
		idx = idx + 1
		return cp[idx]
	end
end
-- Private helper: encode the codepoints t[s] .. t[e] as a UTF-8 string.
-- Used by ustring.char and by the normalization functions.
--
-- Each codepoint is encoded to its own short string and the pieces are
-- joined with table.concat. Passing every output byte to a single
-- S.char(unpack(...)) call fails on long inputs, because unpack cannot
-- return an unbounded number of values.
--
-- @param t table Array of codepoints
-- @param s int First index in t to encode
-- @param e int Last index in t to encode
-- @return string
local function internalChar(t, s, e)
	local ret = {}
	for i = s, e do
		local v = t[i]
		if type(v) ~= 'number' then
			checkType('char', i, v, 'number')
		end
		v = math.floor(v)
		if v < 0 or v > 0x10ffff then
			error(S.format("bad argument #%d to 'char' (value out of range)", i), 2)
		elseif v < 0x80 then
			-- 1 byte: 0xxxxxxx
			ret[#ret + 1] = S.char(v)
		elseif v < 0x800 then
			-- 2 bytes: 110xxxxx 10xxxxxx
			ret[#ret + 1] = S.char(
				0xc0 + math.floor(v / 0x40) % 0x20,
				0x80 + v % 0x40
			)
		elseif v < 0x10000 then
			-- 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
			ret[#ret + 1] = S.char(
				0xe0 + math.floor(v / 0x1000) % 0x10,
				0x80 + math.floor(v / 0x40) % 0x40,
				0x80 + v % 0x40
			)
		else
			-- 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			ret[#ret + 1] = S.char(
				0xf0 + math.floor(v / 0x40000) % 0x08,
				0x80 + math.floor(v / 0x1000) % 0x40,
				0x80 + math.floor(v / 0x40) % 0x40,
				0x80 + v % 0x40
			)
		end
	end
	return table.concat(ret)
end

-- Convert codepoints to a string
--
-- @see string.char
-- @param ... int List of codepoints
-- @return string
function ustring.char(...)
	-- select('#', ...) gives the true argument count, including nils, so a
	-- nil argument is reported as a type error instead of silently ending
	-- the list.
	return internalChar({ ... }, 1, select('#', ...))
end
-- Return the length of a string in codepoints, or-- nil if the string is not valid UTF-8.---- @see string.len-- @param string-- @return int|nilfunction ustring.len(s) checkString('len', s) local cps = utf8_explode(s) if cps
-- Private function to return a substring of a string
--
-- Translates character indexes to byte offsets through cps.bytepos and
-- slices the raw string. No defaults or negative-index handling are applied
-- here: i and j must already be resolved character indexes. The slice ends
-- one byte before the start of character j + 1, which relies on the extra
-- past-the-end entries that utf8_explode appends to cps.bytepos.
--
-- @param s string
-- @param cps table Exploded string
-- @param i int Starting character
-- @param j int Ending character
-- @return string
local function sub(s, cps, i, j)
	return S.sub(s, cps.bytepos[i], cps.bytepos[j + 1] - 1)
end
-- Return a substring of a string---- @see string.sub-- @param s string-- @param i int Starting character [default 1]-- @param j int Ending character [default -1]-- @return stringfunction ustring.sub(s, i, j) checkString('sub', s) checkType('sub', 2, i, 'number', true) checkType('sub', 3, j, 'number', true) local cps = utf8_explode(s) if cps
---- Table-driven functions ----
-- These functions load a conversion table when called
-- Convert a string to uppercase
--
-- The pattern captures one non-continuation byte followed by any
-- continuation bytes (0x80-0xBF), and each capture is looked up in the
-- 'ustring/upper' table. Per string.gsub's table-replacement rule, captures
-- with no entry in the table are left unchanged.
--
-- @see string.upper
-- @param s string
-- @return string
function ustring.upper(s)
	checkString('upper', s)
	local map = require 'ustring/upper'
	-- Assign to a local so gsub's second result (the match count) is dropped.
	local ret = S.gsub(s, '([^\128-\191][\128-\191]*)', map)
	return ret
end
-- Convert a string to lowercase
--
-- The pattern captures one non-continuation byte followed by any
-- continuation bytes (0x80-0xBF), and each capture is looked up in the
-- 'ustring/lower' table. Per string.gsub's table-replacement rule, captures
-- with no entry in the table are left unchanged.
--
-- @see string.lower
-- @param s string
-- @return string
function ustring.lower(s)
	checkString('lower', s)
	local map = require 'ustring/lower'
	-- Assign to a local so gsub's second result (the match count) is dropped.
	local ret = S.gsub(s, '([^\128-\191][\128-\191]*)', map)
	return ret
end
---- Pattern functions ----
-- Ugh. Just ugh.
-- Cache for character sets (e.g. [a-z])local charset_cache = setmetatable(charset_cache,)
-- Private function to find a pattern in a string-- Yes, this basically reimplements the whole of Lua's pattern matching, in-- Lua.---- @see ustring.find-- @param s string-- @param cps table Exploded string-- @param rawpat string Pattern-- @param pattern table Exploded pattern-- @param init int Starting index-- @param noAnchor boolean True to ignore '^'-- @return int starting index of the match-- @return int ending index of the match-- @return string|int* captureslocal function find(s, cps, rawpat, pattern, init, noAnchor) local charsets = require 'ustring/charsets' local anchor = false local ncapt, captures local captparen =
-- Extract the value of a capture from the -- upvalues ncapt and capture. local function getcapt(n, err, errl) if n > ncapt then error(err, errl + 1) elseif type(captures[n])
then error(err, errl + 1) end return sub(s, cps, captures[n][1], captures[n][2]), captures[n][2] - captures[n][1] + 1 else return captures[n], math.floor(math.log10(captures[n])) + 1 end end
local match, match_charset, parse_charset
-- Main matching function. Uses tail recursion where possible. -- Returns the position of the character after the match, and updates the -- upvalues ncapt and captures. match = function (sp, pp) local c = pattern.codepoints[pp] if c
0x29 then -- ')': Pattern is '', capture position captures[ncapt] = sp ret = match(sp, pp + 2) else -- Start capture group captures[ncapt] = ret = match(sp, pp + 1) end if ret then return ret else -- Failed, rollback ncapt = ncapt - 1 return nil end elseif c
'table' and captures[n][2]
0x5b then -- '[': starts character set return match_charset(sp, parse_charset(pp)) elseif c == 0x5d then -- ']' error('Unmatched close-bracket at pattern character ' .. pp, 3) elseif c
0x62 then -- '%b': balanced delimiter match local d1 = pattern.codepoints[pp + 2] local d2 = pattern.codepoints[pp + 3] if not d1 or not d2 then error('malformed pattern (missing arguments to \'%b\')', 3) end if cps.codepoints[sp] ~= d1 then return nil end sp = sp + 1 local ct = 1 while true do c = cps.codepoints[sp] sp = sp + 1 if not c then return nil elseif c
1 then return match(sp, pp + 4) end ct = ct - 1 elseif c
0x66 then -- '%f': frontier pattern match if pattern.codepoints[pp + 2] ~= 0x5b then error('missing \'[\' after %f in pattern at pattern character ' .. pp, 3) end local pp, charset = parse_charset(pp + 2) local c1 = cps.codepoints[sp - 1] or 0 local c2 = cps.codepoints[sp] or 0 if not charset[c1] and charset[c2] then return match(sp, pp) else return nil end elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': backreference local m, l = getcapt(c - 0x30, 'invalid capture index %' .. c .. ' at pattern character ' .. pp, 3) local ep = math.min(cps.len + 1, sp + l) if sub(s, cps, sp, ep - 1)
0x2e then -- '.': match anything if not charset_cache['.'] then local t = setmetatable(t,) charset_cache['.'] = end return match_charset(sp, pp + 1, charset_cache['.'][2]) elseif c
0x24 and pattern.len
cps.len + 1) and sp or nil else -- Any other character matches itself return match_charset(sp, pp + 1,) end end
-- Parse a bracketed character set (e.g. [a-z]) -- Returns the position after the set and a table holding the matching characters parse_charset = function (pp) local _, ep local epp = pattern.bytepos[pp] repeat _, ep = S.find(rawpat, ']', epp, true) if not ep then error('Missing close-bracket for character set beginning at pattern character ' .. pp, 3) end epp = ep + 1 until S.byte(rawpat, ep - 1) ~= 0x25 or S.byte(rawpat, ep - 2)
local p0 = pp local cs = local csrefs = local invert = false pp = pp + 1 if pattern.codepoints[pp]
0x25 then -- '%' c = pattern.codepoints[pp + 1] if charsets[c] then csrefs[#csrefs + 1] = charsets[c] else cs[c] = 1 end pp = pp + 2 elseif pattern.codepoints[pp + 1]
0x5d then -- closing ']' pp = pp + 1 break elseif not c then -- Should never get here, but Just In Case... error('Missing close-bracket', 3) else cs[c] = 1 pp = pp + 1 end end
local ret if not csrefs[2] then if not invert then -- If there's only the one charset table, we can use it directly ret = cs else -- Simple invert ret = setmetatable(ret,) end else -- Ok, we have to iterate over multiple charset tables ret = setmetatable(ret,) end
charset_cache[key] = return pp, ret end
-- Match a character set table with optional quantifier, followed by -- the rest of the pattern. -- Returns same as 'match' above. match_charset = function (sp, pp, charset) local q = pattern.codepoints[pp] if q
0x2b then -- '+', 1 or more matches pp = pp + 1 local i = 0 while charset[cps.codepoints[sp + i]] do i = i + 1 end while i > 0 do local ret = match(sp + i, pp) if ret then return ret end i = i - 1 end return nil elseif q
0x3f then -- '?', 0 or 1 match pp = pp + 1 if charset[cps.codepoints[sp]] then local ret = match(sp + 1, pp) if ret then return ret end end return match(sp, pp) else -- no suffix, must match 1 if charset[cps.codepoints[sp]] then return match(sp + 1, pp) else return nil end end end
init = init or 1 if init < 0 then init = cps.len + init + 1 end init = math.max(1, math.min(init, cps.len + 1))
-- Here is the actual match loop. It just calls 'match' on successive -- starting positions (or not, if the pattern is anchored) until it finds a -- match. local sp = init local pp = 1 if not noAnchor and pattern.codepoints[1]
repeat ncapt, captures = 0, local ep = match(sp, pp) if ep then for i = 1, ncapt do captures[i] = getcapt(i, 'Unclosed capture beginning at pattern character ' .. captparen[pp], 2) end return sp, ep - 1, unpack(captures) end sp = sp + 1 until anchor or sp > cps.len + 1 return nilend
-- Private function to decide if a pattern looks simple enough to use
-- Lua's built-in string library. The following make a pattern not simple:
--  * If it contains any bytes over 0x7f. We could skip these if they're not
--    inside brackets and aren't followed by quantifiers and aren't part of a
--    '%b', but that's too complicated to check.
--  * If it contains a negated character set.
--  * If it contains "%a" or any of the other %-prefixed character sets except
--    %z or %Z.
--  * If it contains a '.' not followed by '*', '+', or '-'. A bare '.' or '.?'
--    would try to match a partial UTF-8 character, but the others will happily
--    enough match a whole character thinking it's 2 or 4. This includes a '.'
--    that is the last character of the pattern.
--  * If it contains position-captures.
--
-- The checks are deliberately conservative: a false "not simple" only sends
-- the pattern down the slower UTF-8-aware path.
--
-- @param string pattern
-- @return boolean
local function patternIsSimple(pattern)
	return not (
		S.find(pattern, '[\128-\255]') or
		S.find(pattern, '%[%^') or
		S.find(pattern, '%%[acdlpsuwxACDLPSUWX]') or
		S.find(pattern, '%.[^*+-]') or
		-- '%.[^*+-]' needs a following character, so a trailing '.' must be
		-- tested separately.
		S.find(pattern, '%.$') or
		-- Plain-text search for the position-capture token. Searching for
		-- the empty string here would match every pattern.
		S.find(pattern, '()', 1, true)
	)
end
-- Find a pattern in a string---- This works just like string.find, with the following changes:-- * Everything works on UTF-8 characters rather than bytes-- * Character classes are redefined in terms of Unicode properties:-- * %a - Letter-- * %c - Control-- * %d - Decimal Number-- * %l - Lower case letter-- * %p - Punctuation-- * %s - Separator, plus HT, LF, FF, CR, and VT-- * %u - Upper case letter-- * %w - Letter or Decimal Number-- * %x - [0-9A-Fa-f0-9A-Fa-f]---- @see string.find-- @param s string-- @param pattern string Pattern-- @param init int Starting index-- @param plain boolean Literal match, no pattern matching-- @return int starting index of the match-- @return int ending index of the match-- @return string|int* capturesfunction ustring.find(s, pattern, init, plain) checkString('find', s) checkPattern('find', pattern) checkType('find', 3, init, 'number', true) checkType('find', 4, plain, 'boolean', true) local cps = utf8_explode(s) if cps
nil then error("bad argument #2 for 'find' (string is not UTF-8)", 2) end
if plain or patternIsSimple(pattern) then if init and init > cps.len + 1 then init = cps.len + 1 end local m = if m[1] then m[1] = cpoffset(cps, m[1]) m[2] = cpoffset(cps, m[2]) end return unpack(m) end
return find(s, cps, pattern, pat, init)end
-- Match a string against a pattern---- @see ustring.find-- @see string.match-- @param s string-- @param pattern string-- @param init int Starting offset for match-- @return string|int* captures, or the whole match if there are nonefunction ustring.match(s, pattern, init) checkString('match', s) checkPattern('match', pattern) checkType('match', 3, init, 'number', true) local cps = utf8_explode(s) if cps
nil then error("bad argument #2 for 'match' (string is not UTF-8)", 2) end
if patternIsSimple(pattern) then return S.match(s, pattern, cps.bytepos[init]) end
local m = if not m[1] then return nil end if m[3] then return unpack(m, 3) end return sub(s, cps, m[1], m[2])end
-- Return an iterator function over the matches for a pattern---- @see ustring.find-- @see string.gmatch-- @param s string-- @param pattern string-- @return function-- @return nil-- @return nilfunction ustring.gmatch(s, pattern) checkString('gmatch', s) checkPattern('gmatch', pattern) if patternIsSimple(pattern) then return S.gmatch(s, pattern) end
local cps = utf8_explode(s) if cps
nil then error("bad argument #2 for 'gmatch' (string is not UTF-8)", 2) end local init = 1
return function local m = if not m[1] then return nil end init = m[2] + 1 if m[3] then return unpack(m, 3) end return sub(s, cps, m[1], m[2]) endend
-- Replace pattern matches in a string---- @see ustring.find-- @see string.gsub-- @param s string-- @param pattern string-- @param repl string|function|table-- @param int n-- @return string-- @return intfunction ustring.gsub(s, pattern, repl, n) checkString('gsub', s) checkPattern('gsub', pattern) checkType('gsub', 4, n, 'number', true) if patternIsSimple(pattern) then return S.gsub(s, pattern, repl, n) end
local cps = utf8_explode(s) if cps
nil then error("bad argument #2 for 'gsub' (string is not UTF-8)", 2) end if n
if pat.codepoints[1]
local tp if type(repl)
'table' then tp = 2 elseif type(repl)
'number' then repl = tostring(repl) tp = 3 else checkType('gsub', 3, repl, 'function or table or string') end
local init = 1 local ct = 0 local ret = while init < cps.len and ct < n do local m = if not m[1] then break end if init < m[1] then ret[#ret + 1] = sub(s, cps, init, m[1] - 1) end local mm = sub(s, cps, m[1], m[2]) local val if tp
2 then val = repl[m[3] or mm] elseif tp
0 and #m < 11 then local ss = S.gsub(repl, '%%[%%0-' .. (#m - 2) .. ']', 'x') ss = S.match(ss, '%%[0-9]') if ss then error('invalid capture index ' .. ss .. ' in replacement string', 2) end end local t = val = S.gsub(repl, '%%[%%0-9]', t) end ret[#ret + 1] = val or mm init = m[2] + 1 ct = ct + 1 end if init <= cps.len then ret[#ret + 1] = sub(s, cps, init, cps.len) end return table.concat(ret), ctend
---- Unicode Normalization ----
-- These functions load a conversion table when called
-- Private helper: decompose an exploded string into a list of codepoints,
-- with adjacent combiners ordered by combining class.
--
-- The three return values line up with internalChar(t, s, e), so the result
-- can be passed straight through.
--
-- @param cps table from utf8_explode
-- @return table Decomposed codepoints
-- @return int First index (always 1)
-- @return int Last index
local function internalToNFD(cps)
	local cp = {}
	local normal = require 'ustring/normalization-data'
	local decomp, combclass = normal.decomp, normal.combclass

	-- Decompose into cp, using the lookup table and logic for hangul
	-- NOTE(review): no Hangul-specific code is visible in this function;
	-- presumably it is handled inside normal.decomp -- confirm.
	for i = 1, cps.len do
		local c = cps.codepoints[i]
		local m = decomp[c]
		if m then
			-- Entries are read starting at index 0; a nil m[0] is harmless
			-- because assigning nil appends nothing.
			-- NOTE(review): presumably the data tables are 0-based -- confirm.
			for j = 0, #m do
				cp[#cp + 1] = m[j]
			end
		else
			cp[#cp + 1] = c
		end
	end

	-- Now sort combiners by class
	-- Adjacent entries that both have a combining class and are out of order
	-- are swapped, then the scan steps back one position so the moved entry
	-- is re-compared with its new left neighbour.
	local i, l = 1, #cp
	while i < l do
		local cc1 = combclass[cp[i]]
		local cc2 = combclass[cp[i + 1]]
		if cc1 and cc2 and cc1 > cc2 then
			cp[i], cp[i + 1] = cp[i + 1], cp[i]
			if i > 1 then
				i = i - 1
			else
				i = i + 1
			end
		else
			i = i + 1
		end
	end

	return cp, 1, l
end
-- Normalize a string to NFC---- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid-- UTF-8.---- @param s string-- @return string|nilfunction ustring.toNFC(s) checkString('toNFC', s)
-- ASCII is always NFC if not S.find(s, '[\128-\255]') then return s end
local cps = utf8_explode(s) if cps
-- First, scan through to see if the string is definitely already NFC local ok = true for i = 1, cps.len do local c = cps.codepoints[i] if normal.check[c] then ok = false break end end if ok then return s end
-- Next, expand to NFD local cp, _, l = internalToNFD(cps)
-- Then combine to NFC. Since NFD->NFC can never expand a character -- sequence, we can do this in-place. local comp = normal.comp[cp[1]] local sc = 1 local j = 1 local lastclass = 0 for i = 2, l do local c = cp[i] local ccc = normal.combclass[c] if ccc then -- Trying a combiner with the starter if comp and lastclass < ccc and comp[c] then -- Yes! c = comp[c] cp[sc] = c comp = normal.comp[c] else -- No, copy it to the right place for output j = j + 1 cp[j] = c lastclass = ccc end elseif comp and lastclass
return internalChar(cp, 1, j)end
-- Normalize a string to NFD---- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid-- UTF-8.---- @param s string-- @return string|nilfunction ustring.toNFD(s) checkString('toNFD', s)
-- ASCII is always NFC if not S.find(s, '[\128-\255]') then return s end
local cps = utf8_explode(s) if cps
return internalChar(internalToNFD(cps))end
return ustring