-- Module:Sandbox/PHansen/URLutil explained

-- From -- Via

-- Descriptions-- en: -- de:

-- Test : -- Wikidata:

--[=[ URLutil 2014-09-20
Utilities for URL etc. on www.
* getAuthority
* getFragment
* getHost
* getLocation
* getPath
* getPort
* getQuery
* getQueryTable
* getRelativePath
* getScheme
* getTLD
* getTop2domain
* getTop3domain
* isAuthority
* isDomain
* isDomainExample
* isDomainInt
* isHost
* isIP
* isIPlocal
* isIPv4
* isIPv6
* isMailAddress
* isMailLink
* isProtocolDialog
* isProtocolWiki
* isResourceURL
* isSuspiciousURL
* isUnescapedURL
* isWebURL
* wikiEscapeURL
Only [[dotted decimal]] notation for IPv4 supported.
Does not support dotted hexadecimal, dotted octal, or single-number formats.
IPv6 URL (bracketed) not yet implemented; might need Wikintax escaping anyway.
]=]

-- table for export
local URLutil = {}

-- Retrieve the scheme of a URI.
--     uri  -- string to inspect; leading whitespace is ignored
-- Returns the lowercased scheme name (without ":") if at least two letters
-- are followed by a colon, "//" if the string starts with two slashes and
-- no scheme letters (protocol-relative), and false otherwise, including
-- when uri is not a string.
URLutil.getURIScheme = function (uri)
    if type(uri) == "string" then
        local prot, colon, slashes = uri:match("^%s*([a-zA-Z]*)(:?)(/?/?)")
        if #colon == 1 and #prot >= 2 then
            return prot:lower()
        elseif #slashes == 2 and #prot == 0 then
            return "//"
        end
    end
    return false
end -- getURIScheme

-- Retrieve the last 2 or 3 dot-separated labels of the host of a URL.
--     url   -- URL string, passed on to URLutil.getHost()
--     mode  -- 3 to retrieve three labels; any other value retrieves two
-- Returns the matching domain string, or false if there is no host or the
-- host has fewer labels than requested.
local getTopDomain = function (url, mode)
    local r = URLutil.getHost(url)
    if r then
        -- The opening parenthesis of the capture is prepended below.
        local pattern = "[%w%%]+%.%a[%w-]*%a)$"
        if mode == 3 then
            pattern = "[%w%%]+%." .. pattern
        end
        -- A dot is prepended so the capture always starts at a label boundary.
        r = mw.ustring.match("." .. r, "%.(" .. pattern)
        if not r then
            r = false
        end
    else
        r = false
    end
    return r
end -- getTopDomain

-- Retrieve the authority part (host, optionally with ":port") of a URL.
--     url  -- URL string; HTML entities are decoded before parsing
-- Returns the lowercased host, followed by ":port" if a port starting with
-- a digit 1-9 is given. Returns nil if url is a string without a valid
-- authority, and false if url is not a string.
URLutil.getAuthority = function (url)
    local r
    if type(url) == "string" then
        local colon, host, port
        local pattern = "^%s*%w*:?//([%w%.%%-]+)(:?)([%d]*)/"
        local s = mw.text.decode(url)
        -- Cut off a fragment; the search for "#" starts at position 6.
        local i = s:find("#", 6, true)
        if i then
            s = s:sub(1, i - 1) .. "/"
        else
            -- The pattern needs a slash terminating the authority.
            s = s .. "/"
        end
        host, colon, port = mw.ustring.match(s, pattern)
        if URLutil.isHost(host) then
            host = mw.ustring.lower(host)
            if colon == ":" then
                if port:find("^[1-9]") then
                    r = (host .. ":" .. port)
                end
            elseif #port == 0 then
                r = host
            end
        end
    else
        r = false
    end
    return r
end -- URLutil.getAuthority

-- Retrieve the fragment (text after the first "#") of a URL.
--     url     -- URL string; HTML entities are decoded before parsing
--     decode  -- optional string selecting a decoding of the fragment:
--                "%"     percent-decoding via mw.uri.decode(..., "PATH")
--                "WIKI"  first turn ".XX" (two hex digits) into "%XX" and
--                        "_" into a space, then percent-decode as above
-- Returns the fragment without the leading "#", false if the URL contains
-- no "#", and nil if url is not a string.
URLutil.getFragment = function (url, decode)
    local r
    if type(url) == "string" then
        local s = mw.text.decode(url)
        local i = s:find("#", 1, true)
        if i then
            -- Trim first, then drop the leading "#".
            r = mw.text.trim(s:sub(i)):sub(2)
            if type(decode) == "string" then
                local encoding = mw.text.trim(decode)
                local launch
                if encoding == "%" then
                    launch = true
                elseif encoding == "WIKI" then
                    r = r:gsub("%.(%x%x)", "%%%1")
                         :gsub("_", " ")
                    launch = true
                end
                if launch then
                    r = mw.uri.decode(r, "PATH")
                end
            end
        else
            r = false
        end
    else
        r = nil
    end
    return r
end -- URLutil.getFragment

-- Retrieve the host of a URL, i.e. the authority without any ":port".
--     url  -- URL string, passed on to URLutil.getAuthority()
-- Returns the host string; a falsy result of getAuthority() (nil or false)
-- is returned unchanged.
URLutil.getHost = function (url)
    local r = URLutil.getAuthority(url)
    if r then
        r = mw.ustring.match(r, "^([%w%.%%-]+):?[%d]*$")
    end
    return r
end -- URLutil.getHost

-- Retrieve a URL without its fragment.
--     url  -- URL string; trimmed, then HTML entities are decoded
-- Returns everything before the first "#" (or the whole decoded string if
-- there is no "#"), false if the string is empty or starts with "#", and
-- nil if url is not a string.
URLutil.getLocation = function (url)
    local r
    if type(url) == "string" then
        r = mw.text.trim(url)
        if r == "" then
            r = false
        else
            local i
            r = mw.text.decode(r)
            i = r:find("#", 1, true)
            if i then
                if i == 1 then
                    -- Nothing but a fragment.
                    r = false
                else
                    r = r:sub(1, i - 1)
                end
            end
        end
    else
        r = nil
    end
    return r
end -- URLutil.getLocation

-- Retrieve the path of a URL, without query and fragment.
--     url  -- URL string, passed on to URLutil.getRelativePath()
-- Returns the relative path cut before the first "?" and before the first
-- "#"; a falsy result of getRelativePath() is returned unchanged.
URLutil.getPath = function (url)
    local r = URLutil.getRelativePath(url)
    if r then
        local s = r:match("^([^%?]*)%?")
        if s then
            r = s
        end
        s = r:match("^([^#]*)#")
        if s then
            r = s
        end
    end
    return r
end -- URLutil.getPath

-- Retrieve the port number of a URL.
--     url  -- URL string, passed on to URLutil.getAuthority()
-- Returns the port as a number, false if the authority carries no port,
-- and the falsy result of getAuthority() if there is no authority.
URLutil.getPort = function (url)
    local r = URLutil.getAuthority(url)
    if r then
        r = r:match(":([1-9][0-9]*)$")
        if r then
            r = tonumber(r)
        else
            r = false
        end
    end
    return r
end -- URLutil.getPort

-- Retrieve the query of a URL, or the value of a single query parameter.
--     url        -- URL string, passed on to URLutil.getLocation()
--     key        -- optional string; name of a single parameter to look up
--     separator  -- optional string; one of "&", ";", "," or "/" used
--                   between parameters when looking up key (default "&")
-- Without key, returns everything after the first "?". With key, returns
-- the value assigned to that parameter. Returns false if there is no query
-- or the key is not present, and the falsy result of getLocation() if the
-- URL has no location.
URLutil.getQuery = function (url, key, separator)
    local r = URLutil.getLocation(url)
    if r then
        r = r:match("^[^%?]*%?(.+)$")
        if r then
            if type(key) == "string" then
                -- Look up the trimmed key, with pattern magic characters
                -- escaped so that it is matched literally.
                local single = mw.text.trim(key):gsub("%p", "%%%0")
                local sep = "&"
                local s, scan
                if type(separator) == "string" then
                    s = mw.text.trim(separator)
                    if s:match("^[&;,/]$") then
                        sep = s
                    end
                end
                -- Wrap the query in separators so every assignment is
                -- delimited on both sides.
                s = string.format("%s%s%s", sep, r, sep)
                scan = string.format("%s%s=([^%s]*)%s", sep, single, sep, sep)
                r = s:match(scan)
            end
        end
        if not r then
            r = false
        end
    end
    return r
end -- URLutil.getQuery

-- Retrieve all query parameters of a URL as a table.
--     url        -- URL string, passed on to URLutil.getQuery()
--     separator  -- optional string; one of "&", ";", "," or "/" used
--                   between parameters (default "&")
-- Returns a table mapping each parameter name to its value string;
-- a component without "=" after its first character is mapped to false.
-- A falsy result of getQuery() is returned unchanged.
URLutil.getQueryTable = function (url, separator)
    local r = URLutil.getQuery(url)
    if r then
        local sep = "&"
        local n, parts, s, set
        if type(separator) == "string" then
            s = mw.text.trim(separator)
            if s:match("^[&;,/]$") then
                sep = s
            end
        end
        parts = mw.text.split(r, sep, true)
        n = #parts
        r = {}
        for i = 1, n do
            s = parts[i]
            -- Search from position 2: a leading "=" is no assignment.
            if s:find("=", 2, true) then
                s, set = s:match("^([^=]+)=(.*)$")
                if s then
                    r[s] = set
                end
            else
                r[s] = false
            end
        end -- for i
    end
    return r
end -- URLutil.getQueryTable

-- Retrieve the part of a URL following the authority (path, query and
-- fragment), starting with "/".
--     url  -- URL string
-- Returns the trimmed relative path, "/" if no path is present but
-- URLutil.isResourceURL() accepts the URL, false otherwise, and nil if
-- url is not a string.
URLutil.getRelativePath = function (url)
    local r
    if type(url) == "string" then
        local s = url:match("^%s*[a-zA-Z]*://(.*)$")
        if s then
            -- Skip the authority: everything up to the first slash.
            s = s:match("[^/]+(/.*)$")
        else
            local x
            x, s = url:match("^%s*(/?)(/.*)$")
            if x == "/" then
                -- Protocol-relative "//host/path": skip the host as well.
                s = s:match("/[^/]+(/.*)$")
            end
        end
        if s then
            r = mw.text.trim(s)
        elseif URLutil.isResourceURL(url) then
            r = "/"
        else
            r = false
        end
    else
        r = nil
    end
    return r
end -- URLutil.getRelativePath

-- Retrieve the scheme of a URL, including the trailing slashes.
--     url  -- URL string; leading whitespace is ignored
-- Returns the lowercased scheme followed by "://" if more than two letters
-- are followed by "://", "//" for a protocol-relative URL, false if
-- neither applies, and nil if url is not a string.
URLutil.getScheme = function (url)
    local r
    if type(url) == "string" then
        local pattern = "^%s*([a-zA-Z]*)(:?)(//)"
        local prot, colon, slashes = url:match(pattern)
        r = false
        if slashes == "//" then
            if colon == ":" then
                if #prot > 2 then
                    r = prot:lower() .. "://"
                end
            elseif #prot == 0 then
                r = "//"
            end
        end
    else
        r = nil
    end
    return r
end -- URLutil.getScheme

-- Retrieve the top-level domain of the host of a URL.
--     url  -- URL string, passed on to URLutil.getHost()
-- Returns the last label of the host, false if the host does not end in
-- such a label, and the falsy result of getHost() if there is no host.
URLutil.getTLD = function (url)
    local r = URLutil.getHost(url)
    if r then
        r = mw.ustring.match(r, "[%w]+%.(%a[%w-]*%a)$")
        if not r then
            r = false
        end
    end
    return r
end -- URLutil.getTLD

-- Retrieve the last two labels of the host of a URL; false if unavailable.
URLutil.getTop2domain = function (url)
    return getTopDomain(url, 2)
end -- URLutil.getTop2domain

-- Retrieve the last three labels of the host of a URL; false if unavailable.
URLutil.getTop3domain = function (url)
    return getTopDomain(url, 3)
end -- URLutil.getTop3domain

-- Check whether a string is a valid authority (host with optional port).
--     s  -- string to check; surrounding whitespace is ignored
-- Returns false if a colon is not followed by a port made of digits
-- without a leading zero, or if the string does not have the shape of an
-- authority at all; otherwise returns the result of URLutil.isHost() for
-- the host part. Returns nil if s is not a string.
URLutil.isAuthority = function (s)
    local r
    if type(s) == "string" then
        local pattern = "^%s*([%w%.%%-]+)(:?)(%d*)%s*$"
        local host, colon, port = mw.ustring.match(s, pattern)
        if colon == ":" then
            if not port:match("^[1-9][0-9]*$") then
                r = false
            end
        elseif port ~= "" then
            r = false
        end
        -- Only consult the host check if the port did not already fail;
        -- otherwise a bad port would be silently accepted.
        if r == nil then
            r = URLutil.isHost(host)
        end
    else
        r = nil
    end
    return r
end -- URLutil.isAuthority

-- Check whether a string has the shape of a domain name.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns true if the string consists of labels followed by a final label
-- that starts and ends with a letter, and the part before that final label
-- starts with an alphanumeric character and contains no "..".
-- Returns false if ".." occurs, and nil in every other case (no match,
-- leading non-alphanumeric character, or s is not a string).
URLutil.isDomain = function (s)
    local r
    if type(s) == "string" then
        local scan = "^%s*([%w%.%%-]+%w)%.(%a[%w-]*%a)%s*$"
        -- Only the part before the final label is examined further.
        s = mw.ustring.match(s, scan)
        if type(s) == "string" then
            if mw.ustring.find(s, "^%w") then
                if mw.ustring.find(s, "..", 1, true) then
                    r = false
                else
                    r = true
                end
            end
        end
    else
        r = nil
    end
    return r
end -- URLutil.isDomain

-- Check whether the host of a URL is a reserved example domain.
-- RFC 2606: example.com example.net example.org example.edu
--     url  -- URL string, passed on to getTopDomain()
-- Returns true if the last two host labels are "example" followed by
-- com, edu, net or org (case-insensitive); false otherwise.
URLutil.isDomainExample = function (url)
    local r = getTopDomain(url, 2)
    if r then
        local s = r:lower():match("^example%.([a-z][a-z][a-z])$")
        if s then
            r = (s == "com" or
                 s == "edu" or
                 s == "net" or
                 s == "org")
        else
            r = false
        end
    end
    return r
end -- URLutil.isDomainExample

-- Check for an Internationalized Domain Name (Punycode).
--     url  -- URL string, passed on to URLutil.getHost()
-- Returns true if the host contains a character outside the range "!".."~"
-- or has a label starting with "xn--"; false for any other host. A falsy
-- result of getHost() is returned unchanged.
URLutil.isDomainInt = function (url)
    local r = URLutil.getHost(url)
    if r then
        if r:match("^[!-~]+$") then
            -- A dot is prepended so the first label is also checked.
            local s = "." .. r
            if s:find(".xn--", 1, true) then
                r = true
            else
                r = false
            end
        else
            r = true
        end
    end
    return r
end -- URLutil.isDomainInt

-- Check whether a string is a host: either a domain or an IP address.
-- Returns the first truthy result of URLutil.isDomain() and URLutil.isIP(),
-- or the falsy result of isIP().
URLutil.isHost = function (s)
    return URLutil.isDomain(s) or URLutil.isIP(s)
end -- URLutil.isHost

-- Check whether a string is an IP address.
-- Returns 4 for an IPv4 address, 6 for an IPv6 address, false otherwise.
URLutil.isIP = function (s)
    return URLutil.isIPv4(s) and 4 or URLutil.isIPv6(s) and 6
end -- URLutil.isIP

-- Check whether a string is a local or private IPv4 address.
-- IPv4 according to RFC 1918, RFC 1122; even any 0.0.0.0 (RFC 5735);
-- additionally the link-local block 169.254.*.*
--     s  -- string to check; leading spaces are ignored
-- Returns true for 0.*.*.*, 10.*.*.*, 127.*.*.*, 169.254.*.*,
-- 172.(16...31).*.* and 192.168.*.* if the string is also accepted as
-- dotted decimal (for the 0 block: four dot-separated digit groups);
-- false otherwise, including when s is not a string.
URLutil.isIPlocal = function (s)
    local r = false
    if type(s) == "string" then
        -- All blocks of interest have a first octet starting with 0 or 1.
        local num = s:match("^ *([01][0-9]*)%.")
        if num then
            num = tonumber(num)
            if num == 0 then
                r = s:match("^ *0+%.[0-9]+%.[0-9]+%.[0-9]+ *$")
            elseif num == 10 or num == 127 then
                -- loopback; private/local host: 127.0.0.1
                r = URLutil.isIPv4(s)
            elseif num == 169 then
                -- 169.254.*.*
                num = s:match("^ *0*169%.([0-9]+)%.")
                if num and tonumber(num) == 254 then
                    r = URLutil.isIPv4(s)
                end
            elseif num == 172 then
                -- 172.(16...31).*.*
                num = s:match("^ *0*172%.([0-9]+)%.")
                if num then
                    num = tonumber(num)
                    if num >= 16 and num <= 31 then
                        r = URLutil.isIPv4(s)
                    end
                end
            elseif num == 192 then
                -- 192.168.*.*
                num = s:match("^ *0*192%.([0-9]+)%.")
                if num and tonumber(num) == 168 then
                    r = URLutil.isIPv4(s)
                end
            end
        end
    end
    -- Normalize match results (string or nil) to a boolean.
    return r and true or false
end -- URLutil.isIPlocal

-- Check whether a string is an IPv4 address in dotted decimal notation.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns true if the string consists of four dot-separated numbers below
-- 256, the first of which starts with a digit 1-9; false otherwise,
-- including when s is not a string.
URLutil.isIPv4 = function (s)
    local function legal(n)
        return (tonumber(n) < 256)
    end
    local r = false
    if type(s) == "string" then
        local p1, p2, p3, p4 = s:match("^%s*([1-9][0-9]?[0-9]?)%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%s*$")
        if p1 and p2 and p3 and p4 then
            r = legal(p1) and legal(p2) and legal(p3) and legal(p4)
        end
    end
    return r
end -- URLutil.isIPv4

-- Check whether a string is an IPv6 address (colon-separated hex groups).
--     s  -- string to check
-- Returns true if the string consists of eight groups of 1-4 hex digits,
-- or of fewer than eight groups with exactly one "::" abbreviation;
-- false otherwise, including when s is not a string.
URLutil.isIPv6 = function (s)
    local dcolon, groups
    if type(s) ~= "string"
       or s:len() == 0
       or s:find("[^:%x]")   -- only colon and hex digits are legal chars
       or s:find("^:[^:]")   -- can begin or end with :: but not with single :
       or s:find("[^:]:$")
       or s:find(":::") then
        return false
    end
    s = mw.text.trim(s)
    s, dcolon = s:gsub("::", ":")
    if dcolon > 1 then
        return false
    end -- at most one ::
    s = s:gsub("^:?", ":")                   -- prepend : if needed
    s, groups = s:gsub(":%x%x?%x?%x?", "")   -- remove valid groups, and count them
    return ((dcolon == 1 and groups < 8) or
            (dcolon == 0 and groups == 8))
           and (s:len() == 0 or
                (dcolon == 1 and s == ":"))  -- might be one dangling : if original ended with ::
end -- URLutil.isIPv6

-- Check whether a string has the shape of an e-mail address.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns the result of URLutil.isDomain() for the part after "@"
-- (nil if the string does not have the shape local@domain), and false if
-- s is not a string.
URLutil.isMailAddress = function (s)
    if type(s) == "string" then
        s = mw.ustring.match(s, "^%s*[%w%.%%_-]+@([%w%.%%-]+)%s*$")
        return URLutil.isDomain(s)
    end
    return false
end -- URLutil.isMailAddress

-- Check whether a string is a "mailto:" link with an e-mail address.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns the result of URLutil.isMailAddress() for the address following
-- the case-insensitive "mailto:" prefix; false if the string does not have
-- that shape or is not a string.
URLutil.isMailLink = function (s)
    if type(s) == "string" then
        local addr
        s, addr = mw.ustring.match(s, "^%s*([Mm][Aa][Ii][Ll][Tt][Oo]):(%S[%w%.%%_-]*@[%w%.%%-]+)%s*$")
        if type(s) == "string" then
            if s:lower() == "mailto" then
                return URLutil.isMailAddress(addr)
            end
        end
    end
    return false
end -- URLutil.isMailLink

-- Check whether a protocol specification is among a list of schemes.
--     prot      -- string such as "http", "http:", "http://" or "//";
--                  surrounding whitespace is ignored
--     supplied  -- string of lowercase scheme names, each one preceded and
--                  followed by a space
-- Returns true if prot is a bare "//" (protocol-relative), or a scheme
-- from the supplied list given either alone or followed by a colon and
-- zero or two slashes; false otherwise, including when prot is not a string.
local function isProtocolAccepted(prot, supplied)
    if type(prot) == "string" then
        local scheme, colon, slashes = mw.ustring.match(prot, "^%s*([a-zA-Z]*)(:?)(/?/?)%s*$")
        -- A single slash is never valid.
        if slashes ~= "/" then
            if scheme == "" then
                if colon ~= ":" and slashes == "//" then
                    return true
                end
            elseif colon == ":" or slashes == "" then
                local s = supplied:match(" " .. scheme:lower() .. " ")
                if type(s) == "string" then
                    return true
                end
            end
        end
    end
    return false
end -- isProtocolAccepted

-- Check whether a protocol specification is in the list of schemes below;
-- see isProtocolAccepted() for the accepted notations.
URLutil.isProtocolMW = function (prot)
    return isProtocolAccepted(prot, " http https ftp ftps ssh sftp irc ircs xmpp sip sips gopher telnet nntp worldwind mailto tel sms news svn git mms bitcoin magnet urn geo ")
end -- URLutil.isProtocolMW

-- Check whether a protocol specification is one of
-- mailto, irc, ircs, ssh, telnet;
-- see isProtocolAccepted() for the accepted notations.
URLutil.isProtocolDialog = function (prot)
    return isProtocolAccepted(prot, " mailto irc ircs ssh telnet ")
end -- URLutil.isProtocolDialog

-- Check whether a protocol specification is one of
-- ftp, ftps, git, http, https, nntp, sftp, svn, worldwind;
-- see isProtocolAccepted() for the accepted notations.
URLutil.isProtocolWiki = function (prot)
    return isProtocolAccepted(prot, " ftp ftps git http https nntp sftp svn worldwind ")
end -- URLutil.isProtocolWiki

-- Check whether a URL points to a resource via //, http, https, ftp or sftp.
--     url  -- URL to check
-- Returns true if the scheme is one of the listed ones, an authority is
-- present, and there is no whitespace between non-whitespace characters;
-- false otherwise.
URLutil.isResourceURL = function (url)
    local scheme = URLutil.getScheme(url)
    if scheme then
        local s = " // http:// https:// ftp:// sftp:// "
        s = s:find(string.format(" %s ", scheme))
        if s then
            if URLutil.getAuthority(url) then
                if not url:match("%S%s+%S") then
                    return true
                end
            end
        end
    end
    return false
end -- URLutil.isResourceURL

-- Check whether a URL looks suspicious.
--     url  -- URL to check
-- Returns true if the URL is not accepted by URLutil.isResourceURL(), or
-- if it contains "@" in the authority, two consecutive apostrophes,
-- one of "[", "|", "]", a character with code point 8201-8207, 8234-8239
-- or 8288, or ends with "." or ","; false otherwise.
URLutil.isSuspiciousURL = function (url)
    if URLutil.isResourceURL(url) then
        local s = URLutil.getAuthority(url)
        local pat = "[%[|%]" ..
                    mw.ustring.char(8201, 45, 8207, 8234, 45, 8239, 8288) ..
                    "]"
        -- The character class holds multi-byte characters and ranges of
        -- them, so it must be matched per code point, not per byte.
        if s:find("@")
           or url:find("''", 1, true)
           or mw.ustring.find(url, pat)
           or url:find("[%.,]$") then
            return true
        end
        -- TODO zero width character ??
        return false
    end
    return true
end -- URLutil.isSuspiciousURL

-- Check whether a web URL contains one of the characters "[", "|", "]".
--     url       -- URL to check
--     trailing  -- if this is a string, the check is skipped and the
--                  result is false
-- Returns true if URLutil.isWebURL() accepts the URL and it contains one
-- of these characters; false otherwise.
URLutil.isUnescapedURL = function (url, trailing)
    if type(trailing) ~= "string" then
        if URLutil.isWebURL(url) then
            if url:match("[%[|%]]") then
                return true
            end
        end
    end
    return false
end -- URLutil.isUnescapedURL

-- Check whether a string is a URL with scheme and authority.
--     url  -- URL to check
-- Returns true if both URLutil.getScheme() and URLutil.getAuthority()
-- succeed and there is no whitespace between non-whitespace characters;
-- false otherwise.
URLutil.isWebURL = function (url)
    if URLutil.getScheme(url) and URLutil.getAuthority(url) then
        if not url:match("%S%s+%S") then
            return true
        end
    end
    return false
end -- URLutil.isWebURL

-- Escape the characters "[", "|" and "]" in a URL as numeric HTML
-- entities (&#91; &#124; &#93;).
--     url  -- URL string
-- Returns the escaped string; a value that is not a string is returned
-- unchanged.
URLutil.wikiEscapeURL = function (url)
    if type(url) == "string" and url:find("[%[|%]]") then
        -- Assigning to a single variable drops the substitution count
        -- returned by the final gsub().
        url = url:gsub("%[", "&#91;")
                 :gsub("|", "&#124;")
                 :gsub("%]", "&#93;")
    end
    return url
end -- URLutil.wikiEscapeURL

-- Provide template access and expose URLutil table to require

-- Export table. Each function takes a frame object and reads its
-- parameters from frame.args. The get* functions return the retrieved
-- string (or number) or "" on failure; the is* functions return "1" or "",
-- except isIP, which returns 4, 6 or "".
local p = {}

function p.getURIScheme(frame)
    return URLutil.getURIScheme(frame.args[1]) or ""
end

function p.getAuthority(frame)
    return URLutil.getAuthority(frame.args[1]) or ""
end

-- Returns the fragment including its leading "#"; args[2] selects decoding.
function p.getFragment(frame)
    local r = URLutil.getFragment(frame.args[1], frame.args[2])
    if r then
        r = "#" .. r
    else
        r = ""
    end
    return r
end

function p.getHost(frame)
    return URLutil.getHost(frame.args[1]) or ""
end

function p.getLocation(frame)
    return URLutil.getLocation(frame.args[1]) or ""
end

function p.getPath(frame)
    return URLutil.getPath(frame.args[1]) or ""
end

function p.getPort(frame)
    return URLutil.getPort(frame.args[1]) or ""
end

-- args[2] is an optional parameter name, args[3] an optional separator.
-- Without a parameter name the whole query is returned with leading "?".
function p.getQuery(frame)
    local r
    local key = frame.args[2]
    if key then
        key = mw.text.trim(key)
        if key == "" then
            key = nil
        end
    end
    r = URLutil.getQuery(frame.args[1], key, frame.args[3])
    if r then
        if not key then
            r = "?" .. r
        end
    else
        r = ""
    end
    return r
end

function p.getRelativePath(frame)
    return URLutil.getRelativePath(frame.args[1]) or ""
end

function p.getScheme(frame)
    return URLutil.getScheme(frame.args[1]) or ""
end

function p.getTLD(frame)
    return URLutil.getTLD(frame.args[1]) or ""
end

function p.getTop2domain(frame)
    return URLutil.getTop2domain(frame.args[1]) or ""
end

function p.getTop3domain(frame)
    return URLutil.getTop3domain(frame.args[1]) or ""
end

function p.isAuthority(frame)
    return URLutil.isAuthority(frame.args[1]) and "1" or ""
end

function p.isDomain(frame)
    return URLutil.isDomain(frame.args[1]) and "1" or ""
end

function p.isDomainExample(frame)
    return URLutil.isDomainExample(frame.args[1]) and "1" or ""
end

function p.isDomainInt(frame)
    return URLutil.isDomainInt(frame.args[1]) and "1" or ""
end

function p.isHost(frame)
    return URLutil.isHost(frame.args[1]) and "1" or ""
end

function p.isIP(frame)
    return URLutil.isIP(frame.args[1]) or ""
end

function p.isIPlocal(frame)
    return URLutil.isIPlocal(frame.args[1]) and "1" or ""
end

function p.isIPv4(frame)
    return URLutil.isIPv4(frame.args[1]) and "1" or ""
end

function p.isIPv6(frame)
    return URLutil.isIPv6(frame.args[1]) and "1" or ""
end

function p.isMailAddress(frame)
    return URLutil.isMailAddress(frame.args[1]) and "1" or ""
end

function p.isMailLink(frame)
    return URLutil.isMailLink(frame.args[1]) and "1" or ""
end

function p.isProtocolMW(frame)
    return URLutil.isProtocolMW(frame.args[1]) and "1" or ""
end

function p.isProtocolDialog(frame)
    return URLutil.isProtocolDialog(frame.args[1]) and "1" or ""
end

function p.isProtocolWiki(frame)
    return URLutil.isProtocolWiki(frame.args[1]) and "1" or ""
end

function p.isResourceURL(frame)
    return URLutil.isResourceURL(frame.args[1]) and "1" or ""
end

function p.isSuspiciousURL(frame)
    return URLutil.isSuspiciousURL(frame.args[1]) and "1" or ""
end

function p.isUnescapedURL(frame)
    return URLutil.isUnescapedURL(frame.args[1], frame.args[2]) and "1" or ""
end

function p.isWebURL(frame)
    return URLutil.isWebURL(frame.args[1]) and "1" or ""
end

function p.wikiEscapeURL(frame)
    return URLutil.wikiEscapeURL(frame.args[1])
end

-- Access for other modules: require(...).URLutil() yields the URLutil table.
function p.URLutil()
    return URLutil
end

return p