-- Módulo:String/avanzado
-- Documentation for this module can be created at Módulo:String/avanzado/doc
-- tomado de https://en.wiktionary.org/wiki/Module:utilities
-- y de https://en.wiktionary.org/wiki/Module:links
local decode = mw.text.decode
local u = mw.ustring.char
local export = {}
-- Resolves HTML entities found in a string into plaintext.
-- Named entities (e.g. "&amp;") are looked up in the data table loaded from
-- "Módulo:datos/entidades"; numeric entities (e.g. "&#65;", "&#x41;", "&#X41;")
-- are decoded with mw.text.decode. Entities that cannot be resolved are left
-- untouched.
do
	-- Loaded on first use only, so that modules which never need named
	-- entities do not pay for loading the data page.
	local named_entities

	-- gsub callback: receives the entity name without "&" and ";".
	-- Returns nil for unknown names, which makes gsub keep the original text.
	local function resolve_named(name)
		if not named_entities then
			named_entities = mw.loadData("Módulo:datos/entidades")
		end
		return named_entities[name]
	end

	-- gsub callback: receives the whole numeric entity.
	-- The entity is lowercased first so that a capital "X" hex prefix is
	-- decoded as well. Returns nil when decoding changed nothing, which makes
	-- gsub keep the original text.
	local function resolve_numbered(entity)
		local lowered = entity:lower()
		local decoded = decode(lowered)
		if decoded ~= lowered then
			return decoded
		end
		return nil
	end

	-- @param text string possibly containing HTML entities
	-- @return the string with every resolvable entity replaced (single value)
	function export.get_entities(text)
		local resolved = text:gsub("&([^#&;]+);", resolve_named)
		resolved = resolved:gsub("&#[Xx]?%x+;", resolve_numbered)
		return resolved
	end
end
--function export.get_entities(text)
-- return (text:gsub("&([^#&;]+);", get_named_entity)
-- :gsub("&#[Xx]?%x+;", get_numbered_entity)
-- )
--end
-- Converts the characters of `text` that belong to `set` into HTML entities.
-- Unless `raw` is truthy, entities already present in `text` are first
-- resolved into plaintext, which allows mixed input and prevents an existing
-- entity from being encoded a second time.
-- @param text string to encode
-- @param set  character set passed through to mw.text.encode
-- @param raw  when truthy, skip the preliminary entity resolution
-- @return the encoded string
function export.make_entities(text, set, raw)
	if not raw then
		text = export.get_entities(text)
	end
	return mw.text.encode(text, set)
end
-- Replaces internal wikilinks in `text` with their display text.
-- File/image links are kept as they are, category links are removed
-- completely, and every other link is reduced to what would be displayed.
-- @param text string, or a table whose `args[1]` holds the string
--             (presumably a frame object when called via #invoke)
-- @param tag  when truthy, the display text is wrapped in <link>...</link>
-- @return the processed string; "" when there is no text
function export.remove_links(text, tag)
	if type(text) == "table" then
		text = text.args[1]
	end
	if not (text and text ~= "") then
		return ""
	end

	local file_prefixes = {"file", "image", "archivo", "imagen"}
	local category_prefixes = {"category", "categoría", "cat"}

	-- True when the (already lowercased) link target starts with one of the
	-- given namespace prefixes followed by a colon.
	local function starts_with_any(lowered, prefixes)
		for _, prefix in ipairs(prefixes) do
			if lowered:match("^" .. prefix .. ":") then
				return true
			end
		end
		return false
	end

	-- gsub callback for one link: `opening` and `closing` are the bracket
	-- placeholders, `target` is everything between them.
	local function unlink(opening, target, closing)
		local lowered = target:lower()
		-- Don't remove files.
		if starts_with_any(lowered, file_prefixes) then
			return opening .. target .. closing
		end
		-- Remove categories completely.
		if starts_with_any(lowered, category_prefixes) then
			return ""
		end
		-- In piped links, drop everything before the pipe, unless the pipe is
		-- the final character (the pipe trick), in which case only the pipe
		-- itself is dropped.
		local display = target:match("^[^|]*|(.+)") or target:match("([^|]+)|$") or target
		if tag then
			return "<link>" .. display .. "</link>"
		end
		return display
	end

	-- Double brackets are swapped for single control characters so that a
	-- link body can be matched with a simple negated character class.
	local marked = text:gsub("%[%[", "\1"):gsub("%]%]", "\2")
	marked = marked:gsub("(\1)([^\1\2]-)(\2)", unlink)

	-- Put back whatever brackets were not consumed.
	local restored = marked:gsub("\1", "[["):gsub("\2", "]]")
	return restored
end
-- Namespace prefixes (lowercase) that identify file links; kept in line with
-- the prefixes that export.remove_links treats as files.
local plaintext_file_prefixes = {"file", "image", "archivo", "imagen"}

-- Strips wiki markup from `text`, giving the plaintext of what is displayed
-- on the page: strip markers, HTML tags, categories and files are removed,
-- internal and external links are reduced to their display text, bold/italic
-- apostrophes and soft hyphens are dropped, and HTML entities are resolved.
-- @param text wikitext string
-- @return the trimmed plaintext
function export.get_plaintext(text)
	-- Swap double brackets for single control characters so that they can be
	-- told apart from the single brackets of external links.
	text = text
		:gsub("%[%[", "\1")
		:gsub("%]%]", "\2")
	-- Remove strip markers and HTML tags.
	text = mw.text.unstrip(text)
		:gsub("<[^<>\1\2]+>", "")
	-- Parse internal links for the display text, and remove categories.
	-- remove_links restores "[[" and "]]" before returning, so the
	-- placeholders have to be put back in for the steps below to see them.
	text = export.remove_links(text)
		:gsub("%[%[", "\1")
		:gsub("%]%]", "\2")
	-- Remove files. The namespace is compared case-insensitively and the
	-- Spanish aliases are included; returning nil keeps any other link.
	text = text:gsub("\1([^\1\2]+)\2",
		function(target)
			local lowered = target:lower()
			for _, prefix in ipairs(plaintext_file_prefixes) do
				if lowered:find("^" .. prefix .. ":") then
					return ""
				end
			end
			return nil
		end)
	-- Parse external links for the display text (empty if there is none).
	text = text:gsub("%[(https?://[^%[%]]+)%]",
		function(capture)
			return capture:match("https?://[^%s%]]+%s([^%]]+)") or ""
		end)
	text = text
		:gsub("\1", "[[")
		:gsub("\2", "]]")
	-- Any remaining square brackets aren't involved in links, but must be
	-- escaped to avoid creating new links.
	-- NOTE(review): get_entities below resolves numeric entities again, so
	-- this escaping may be undone in the returned value - confirm intent.
	text = text:gsub("[%[%]]", mw.text.nowiki)
	-- Strip bold, italics and soft hyphens. The soft hyphen (U+00AD) is
	-- written as an explicit UTF-8 byte escape because the literal character
	-- is invisible and is easily lost when the source is copied.
	text = text
		:gsub("('*)'''(.-'*)'''", "%1%2")
		:gsub("('*)''(.-'*)''", "%1%2")
		:gsub("\194\173", "")
	-- Get any HTML entities.
	-- Note: don't decode URL percent encoding, as it shouldn't be used in
	-- display text and may cause problems if % is used.
	text = export.get_entities(text)
	return mw.text.trim(text)
end
-- Returns the content of a page section, including all of its subsections.
-- `content` is raw wikitext. `names` is either a single section name or a
-- table of names describing a path: each name is looked up inside the section
-- found for the previous one, e.g. {"Spanish", "Noun"} gives the first "Noun"
-- section located under "Spanish". The sections of a path do not have to be
-- at adjacent heading levels. `level` is optional and, when given, is the
-- heading level required for the returned section (the last name of a path);
-- without it the first section with a matching name is used.
-- Returns nil when no matching section exists. Raises an error when the level
-- is above 6, when a name contains a newline, or when more than six names are
-- given.
function export.get_section(content, names, level)
	local trim = mw.text.trim
	-- A heading line: position capture, the run of "=" signs, the title, the
	-- same run of "=" again, optional trailing blanks, then end of line.
	local heading_pattern = "()%f[^%z\n\r](=+)([^\n\r]+)%2[\t ]*%f[%z\n\r]"

	-- Extracts one section called `wanted` from `wikitext`.
	local function extract(wikitext, wanted, wanted_level)
		if not (wikitext and wanted) then
			return nil
		end
		if wanted_level and wanted_level > 6 then
			error("Heading level cannot be greater than 6.")
		end
		if wanted:find("[\n\r]") then
			error("Heading name cannot contain a newline.")
		end
		wanted = trim(wanted)

		local first
		for position, equals, title in wikitext:gmatch(heading_pattern) do
			local depth = #equals
			if first then
				-- Already inside the section: it ends at the next heading of
				-- the same or a higher rank. Nothing can nest under level 6.
				if wanted_level == 6 or depth <= wanted_level then
					return wikitext:sub(first, position - 1)
				end
			else
				-- Headings stop at level 6; any surplus "=" signs are treated
				-- as part of the title.
				if depth > 6 then
					local surplus = ("="):rep(depth - 6)
					title = surplus .. title .. surplus
					depth = 6
				end
				local level_ok = (not wanted_level) or depth == wanted_level
				if level_ok and trim(title) == wanted then
					first = position
					wanted_level = depth
				end
			end
		end

		-- The section runs to the end of the text, or was never found.
		if first then
			return wikitext:sub(first)
		end
		return nil
	end

	if type(names) == "string" then
		return extract(content, names, level)
	end

	local count = #names
	if count > 6 then
		error("Not possible specify more than 5 subsections: headings only go up to level 6.")
	end
	for index, name in ipairs(names) do
		-- Only the final name of the path is constrained by `level`.
		local required_level = nil
		if index == count then
			required_level = level
		end
		content = extract(content, name, required_level)
	end
	return content
end
do
local get_script = require("Módulo:scripts").getByCode
--[=[
Picks the best script for a string without knowing its language.

The text is reduced to plaintext and split into UTF-8 characters. For each
character, charAScript (from "Módulo:scripts/charAScript") is asked for a list
of script codes (presumably ordered from best to worst match - confirm against
that module), and a counter is incremented for every code in the list.

Each script has a two-part counter: slot 1 counts primary matches (the script
was listed first for the character) and slot 2 counts secondary matches (it was
listed later). Counters are compared by their total first, then by primary
matches, then by secondary matches. Per the original notes, this is what lets
`Grek` win over `Polyt` when no character matches `Polyt` exclusively.

`Jpan` and `Kore` are combinations of other scripts; when not counted directly
their counters are the sums of their constituents, and they are discarded when
a single constituent accounts for the whole total.

Returns the script object for the winning code, or for "None" if nothing was
counted.
]=]
local function findBestScriptWithoutLang(text)
	-- Scripts that are defined as the union of other scripts.
	local combined_scripts = {
		Jpan = {["Hani"] = true, ["Hira"] = true, ["Kana"] = true},
		Kore = {["Hani"] = true, ["Hang"] = true}
	}

	-- Ordering of two {primary, secondary} counters.
	local counter_mt = {
		__lt = function(a, b)
			local total_a, total_b = a[1] + a[2], b[1] + b[2]
			if total_a ~= total_b then
				return total_a < total_b
			end
			if a[1] ~= b[1] then
				return a[1] < b[1]
			end
			return a[2] < b[2]
		end
	}

	-- Reading a script that has no stored counter yields a fresh one: zeros
	-- for ordinary scripts, the constituent sums for combined scripts. The
	-- fresh counter is NOT stored by the lookup itself.
	local tally_mt = {}
	tally_mt.__index = function(tally, code)
		local counter = {0, 0}
		local parts = combined_scripts[code]
		if parts then
			for part in pairs(parts) do
				local part_counter = tally[part]
				counter[1] = counter[1] + part_counter[1]
				counter[2] = counter[2] + part_counter[2]
			end
		end
		return setmetatable(counter, counter_mt)
	end
	local tally = setmetatable({}, tally_mt)

	text = export.get_plaintext(text)

	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local codes = require("Módulo:scripts/charAScript").charAScript(character, true)
		for position, code in ipairs(codes) do
			-- Store the counter (creating it if needed) before changing it.
			local counter = tally[code]
			tally[code] = counter
			-- Slot 1 for the first listed script, slot 2 for all the others.
			local slot = math.min(position, 2)
			counter[slot] = counter[slot] + 1
		end
	end

	-- Check the combined script counts. If a single constituent has the same
	-- count (i.e. it's the only one), discard the combined script.
	for combined_code, parts in pairs(combined_scripts) do
		for part in pairs(parts) do
			local combined_counter = tally[combined_code]
			tally[combined_code] = combined_counter
			local part_counter = tally[part]
			if (part_counter[1] + part_counter[2]) == (combined_counter[1] + combined_counter[2]) then
				tally[combined_code] = nil
				break
			end
		end
	end

	-- Keep the script whose counter is strictly greater than all seen so far.
	local best_code, best_counter
	for code, counter in pairs(tally) do
		if best_counter == nil or best_counter < counter then
			best_code, best_counter = code, counter
		end
	end

	return get_script(best_code or "None")
end
-- Picks, among the script codes listed for a language, the script that best
-- matches `text`.
-- @param text    wikitext whose script is wanted
-- @param idioma  language data; not used by this function
-- @param scripts sequence of script codes, in the language's order of
--                preference
-- @return a script object (as returned by get_script); the "None" script if
--         nothing matches
local function findBestScriptWithLang(text, idioma, scripts)
	-- One UTF-8 encoded character.
	local utf8_char = "[\1-\127\194-\244][\128-\191]*"

	-- Remove all formatting characters.
	text = export.get_plaintext(text)

	-- Remove any spacing or punctuation characters, and get resultant length.
	-- Counting instances of UTF-8 character pattern is faster than mw.ustring.len.
	local reducedText = mw.ustring.gsub(text, "[%s%p]+", "")
	local _, length = string.gsub(reducedText, utf8_char, "")
	-- If the length is 0 then we're probably dealing with a punctuation
	-- character, so only remove spacing characters, in case it is
	-- script-specific.
	if length == 0 then
		reducedText = mw.ustring.gsub(text, "%s+", "")
		_, length = string.gsub(reducedText, utf8_char, "")
		if length == 0 then
			return get_script("None")
		end
	end

	-- Ensure that "Hant", "Hans" and "Hani" are moved to the end of the list
	-- (in that order, if present), as they are a special-case.
	-- ipairs is used because the order of the list decides ties below, and
	-- the traversal order of pairs is not specified.
	local hasHant, hasHans, hasHani = false, false, false
	local candidates = {}
	for _, code in ipairs(scripts) do
		if code == "Hant" then
			hasHant = true
		elseif code == "Hans" then
			hasHans = true
		elseif code == "Hani" then
			hasHani = true
		else
			table.insert(candidates, get_script(code))
		end
	end
	if hasHant then table.insert(candidates, get_script("Hant")) end
	if hasHans then table.insert(candidates, get_script("Hans")) end
	if hasHani then table.insert(candidates, get_script("Hani")) end

	-- Try to match every script against the text, and keep the one with the
	-- most matching characters.
	local bestcount, bestscript = 0, nil
	for _, script in ipairs(candidates) do
		-- This function counts only the characters that belong to that
		-- script, which is what we need.
		local count = script:countCharacters(reducedText)
		local code = script._code
		local isHan = code == "Hant" or code == "Hans" or code == "Hani"
		-- Special case for "Hant", "Hans" and "Hani", which are returned if
		-- they match at least one character, under the assumption that
		-- (1) traditional and simplified characters will not be mixed if a
		-- language uses both scripts, and (2) any terms using Han characters
		-- with another script (e.g. Latin) will still need a Han code (not
		-- counting those which use Jpan or Kore). This is for efficiency, due
		-- to the special checks required for "Hant" and "Hans", and to prevent
		-- "Hani" from overriding either, as it will always match with at least
		-- as many characters, while characters used in both will only match
		-- with "Hani".
		if count >= length or (isHan and count > 0) then
			return script
		elseif count > bestcount then
			bestcount = count
			bestscript = script
		end
	end

	-- Secondary check for languages that have "Hant" or "Hans" but not
	-- "Hani", but which still have multiple scripts (e.g. Macau Pidgin
	-- Portuguese): characters which are not exclusively traditional or
	-- simplified will not be found by the main check, so a separate "Hani"
	-- check is necessary to see if Han characters are present at all. If
	-- successful, return "Hant" or "Hans" as applicable.
	if (hasHant or hasHans) and not hasHani then
		local hanScript
		for _, script in ipairs(candidates) do
			if script._code == "Hant" or script._code == "Hans" then
				hanScript = script
				break
			end
		end
		-- The "Hani" count does not depend on the candidate, so it is
		-- evaluated a single time rather than once per candidate.
		if hanScript and get_script("Hani"):countCharacters(reducedText) > 0 then
			return hanScript
		end
	end

	-- Fall back to the best partial match, or to "None" if there was none.
	return bestscript or get_script("None")
end
-- Finds the script that best fits `text`.
-- @param text   string to examine; nil, "" and "-" give the "None" script
-- @param idioma optional language data table; its fourth element is read as a
--               comma-separated list of script codes, or "All"
-- @return a script object (as returned by get_script)
function export.findBestScript(text, idioma)
	if (not text) or text == "" or text == "-" then
		return get_script("None")
	end
	-- Without a language, or for a language that accepts every script, the
	-- script is detected from the characters alone.
	if not idioma or idioma[4] == "All" then
		return findBestScriptWithoutLang(text)
	end

	-- Split the list on commas. Each item is trimmed on both sides, so that
	-- whitespace before a comma or at the end of the list does not become
	-- part of a script code, and items that are empty after trimming are
	-- skipped.
	local scripts = {}
	for item in idioma[4]:gmatch("[^,]+") do
		local code = item:match("^%s*(.-)%s*$")
		if code ~= "" then
			scripts[#scripts + 1] = code
		end
	end

	-- A language with exactly one script needs no detection.
	if scripts[1] and not scripts[2] then
		return get_script(scripts[1])
	end
	return findBestScriptWithLang(text, idioma, scripts)
end
end
return export