Módulo:generar-pron/ca
La documentación para este módulo puede ser creada en Módulo:generar-pron/ca/doc
local export = {}
local insert = table.insert
local concat = table.concat
local m_table = require("Módulo:tabla")
local listToSet = m_table.listToSet
-- Join every key of `tab` into one string. Keys are visited with pairs(), so their order in the result is
-- unspecified; the result is only meant for order-insensitive uses such as building a character class.
local function concat_keys(tab)
	local keys = {}
	for key in pairs(tab) do
		keys[#keys + 1] = key
	end
	return concat(keys)
end
-- Join every value of `tab` into one string. Values are visited with pairs(), so their order in the result is
-- unspecified (and repeated values are repeated in the output).
local function concat_vals(tab)
	local values = {}
	for _, value in pairs(tab) do
		values[#values + 1] = value
	end
	return concat(values)
end
local m_str = require("Módulo:String")
local substr = m_str.sub
local strfind = m_str.find
local strmatch = m_str.match
local strsplit = m_str.split
local strsubn = m_str.gsub
local strsubb = m_str.gsubb
local strsubrep = m_str.gsub_rep
local strlower = m_str.lower
local strstrip = m_str.strip
local strmatchit = m_str.gmatch
local u = m_str.char
local strexplode = m_str.explode_utf8
local strnfc = m_str.toNFC
local strhtml = m_str.encode_html
-- Substitution helper: same arguments as strsubn(), but only its first return value (the resulting string) is
-- passed on, so the call can be used safely as the last argument of another call.
local function strsub(term, foo, bar)
	return (strsubn(term, foo, bar))
end
-- Character class of the punctuation that separates fragments of the text; normalizar() replaces every match
-- with " | ".
local PUNTUACION = "[%(%)%[%]%{%}¡!¿?.,;:–—]"
-- Same class extended with quotation marks; normalizar() deletes whatever still matches it after the PUNTUACION
-- pass. (Some guillemets are listed twice, which is harmless inside a character class.)
local PUNTUACION_EXTRA = "[%(%)%[%]%{%}¡!¿?.,;:–—\"“”„‟‘’«»»«‹››‹]"
-- Dialect identifiers; the functions in this module compare their `dialect` argument against these.
local BALEARICO = "bal"
local CENTRAL = "cen"
local VALENCIANO = "val"
-- Vowel inventories. The `_l` names are plain letter lists meant to be spliced into Lua character classes.
local written_unaccented_vowel_l = "aeiouyAEIOUY"
-- Accented vowels that count as stressed (split_syllables() tests syllable nuclei against this list).
local written_stressed_vowel_l = "àèéêëíòóôúýÀÈÉÊËÍÒÓÔÚÝ"
-- Vowels with a diaeresis; these carry a diacritic but are not treated as stress marks.
local written_accented_not_stressed_vowel_l = "ïüÏÜ"
local written_accented_vowel_l = written_stressed_vowel_l .. written_accented_not_stressed_vowel_l
-- IPA vowel symbols that may appear in the text in addition to the written vowels.
local ipa_vowel_l = "ɔɛə"
local written_vowel_l = written_unaccented_vowel_l .. written_accented_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Pattern matching any single vowel (written or IPA).
local V = "[" .. vowel_l .. "]"
-- Maps each accented written vowel (lowercase and uppercase) to the corresponding letter without the diacritic.
-- NOTE(review): no use of this table is visible in the part of the file reviewed here.
local written_accented_to_plain_vowel = {
["à"] = "a",
["è"] = "e",
["é"] = "e",
["ê"] = "e",
["ë"] = "e",
["í"] = "i",
["ï"] = "i",
["ò"] = "o",
["ó"] = "o",
["ô"] = "o",
["ú"] = "u",
["ü"] = "u",
["ý"] = "y",
["À"] = "A",
["È"] = "E",
["É"] = "E",
["Ê"] = "E",
["Ë"] = "E",
["Í"] = "I",
["Ï"] = "I",
["Ò"] = "O",
["Ó"] = "O",
["Ô"] = "O",
["Ú"] = "U",
["Ü"] = "U",
["Ý"] = "Y",
}
-- Combining diacritics, built from their code points with u() (an alias of the String module's `char`).
local AC = u(0x0301) -- combining acute accent
local GR = u(0x0300) -- combining grave accent
local CFLEX = u(0x0302) -- combining circumflex
local DOTOVER = u(0x0307) -- combining dot above; marks a word that must not receive default stress
local DIA = u(0x0308) -- combining diaeresis
local LINEUNDER = u(0x0331) -- combining macron below; a stressed syllable carrying it is not taken as primary stress
-- Naming convention below: `_l` is a bare list of characters, `_c` is the same list wrapped in [...] as a pattern.
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
local sylsep_l = "%-."..ipa_stress_l -- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_c = "[" .. sylsep_l .. "]"
-- Characters that tie two words together (undertie and apostrophe).
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
-- Everything that can stand between two letters inside a word. ipa_stress_l ends up listed twice (it is already
-- part of sylsep_l), which is harmless inside a character class.
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
-- Word delimiters: '#' (word-boundary marker) and space.
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- A consonant is defined negatively: any character that is neither a vowel nor a separator.
local neg_guts_of_cons = vowel_l .. separator_l
local C = "[^" .. neg_guts_of_cons .. "]" -- consonant class including h
-- Exported list (and character class) of the accented mid vowels e/o.
-- NOTE(review): presumably consulted by callers to check whether a respelling states the mid-vowel quality; confirm.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"
-- Placeholder characters that word_fixes() substitutes for the notations "(r)" and "(rr)".
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not currently
-- used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents
-- Set of letter sequences accepted as a syllable onset. split_syllables() moves letters from the front of an onset
-- to the preceding coda until what remains is found in this set (or is empty).
local valid_onsets = listToSet {
"b", "bl", "br",
"c", "cl", "cr",
"ç",
"d", "dj", "dr",
"f", "fl", "fr",
"g", "gl", "gr", "gu", "gü",
"h",
"i",
"j",
"k", "kl", "kr",
"l", "ll",
"m",
"n", "ny", "ñ",
"p", "pl", "pr",
"qu", "qü",
"r", "rr",
"s", "ss",
"t", "tg", "tj", "tr", "tx", "tz",
"u",
"v", "vl", "vr",
"w",
"x",
"ʃ", -- e.g. 'χruʃóf' respelling of [[Khrusxov]]
"χ", -- in case of respelling
"y",
"z",
}
-- Maps precomposed letters carrying a dot above to the base letter followed by the combining DOTOVER character.
-- descomponer() uses it as a substitution table.
local decompose_dotover = {
-- No composed i, u or U with DOTOVER.
["ȧ"] = "a" .. DOTOVER,
["ė"] = "e" .. DOTOVER,
["ȯ"] = "o" .. DOTOVER,
["ẏ"] = "y" .. DOTOVER,
["Ȧ"] = "A" .. DOTOVER,
["Ė"] = "E" .. DOTOVER,
["İ"] = "I" .. DOTOVER,
["Ȯ"] = "O" .. DOTOVER,
["Ẏ"] = "Y" .. DOTOVER,
}
-- All keys of the table above joined into one string (in unspecified order), for use inside a character class.
local dotover_keys = concat_keys(decompose_dotover)
-- Set of words treated as unstressed; handle_unstressed_words() looks each word of the input up in this set.
-- Entries are written in lowercase without any diacritic.
local unstressed_words = listToSet {
-- proclitic object pronouns
"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. [[tant me fa]]
"me", "te", "se", "lo", "los", "nos", "vos", "ne",
-- contracted object pronouns and articles attached with apostrophe so no need to include
-- unstressed possessives
"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
-- prepositions
"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
-- prepositional contractions
"al", "als", "del", "dels", "pel", "pels",
-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
-- personal articles
"na", -- 'en' already included above
-- indefinite articles
"un", "uns",
-- salat articles
"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
-- conjunctions
"i", "o", "si", "ni", "que",
}
-- For each letter (uppercase and lowercase, plus "ç" and the geminated "l·l"), the list of spelled-out names of that
-- letter. Both cases of a letter carry the same list.
-- NOTE(review): presumably used to generate the pronunciation of entries that are single letters; the consumer is
-- outside the part of the file reviewed here.
local pron_abc = {
["A"] = {"a"},
["a"] = {"a"},
["B"] = {"be", "be alta"},
["b"] = {"be", "be alta"},
["C"] = {"ce"},
["c"] = {"ce"},
["D"] = {"de"},
["d"] = {"de"},
["E"] = {"e"},
["e"] = {"e"},
["F"] = {"efa", "efe", "ef"},
["f"] = {"efa", "efe", "ef"},
["G"] = {"ge"},
["g"] = {"ge"},
["H"] = {"hac"},
["h"] = {"hac"},
["I"] = {"i"},
["i"] = {"i"},
["J"] = {"jota"},
["j"] = {"jota"},
["K"] = {"ca", "ka"},
["k"] = {"ca", "ka"},
["L"] = {"ela", "ele", "el"},
["l"] = {"ela", "ele", "el"},
["M"] = {"ema", "eme", "em"},
["m"] = {"ema", "eme", "em"},
["N"] = {"ena", "ene", "en"},
["n"] = {"ena", "ene", "en"},
["O"] = {"o"},
["o"] = {"o"},
["P"] = {"pe"},
["p"] = {"pe"},
["Q"] = {"cu"},
["q"] = {"cu"},
["R"] = {"erra", "erre", "er"},
["r"] = {"erra", "erre", "er"},
["S"] = {"essa", "esse", "es"},
["s"] = {"essa", "esse", "es"},
["T"] = {"te"},
["t"] = {"te"},
["U"] = {"u"},
["u"] = {"u"},
["V"] = {"ve"},
["v"] = {"ve"},
["W"] = {"doble ve", "ve doble"},
["w"] = {"doble ve", "ve doble"},
["X"] = {"ics", "xeix"},
["x"] = {"ics", "xeix"},
["Y"] = {"i grega"},
["y"] = {"i grega"},
["Z"] = {"zeta"},
["z"] = {"zeta"},
["Ç"] = {"ce trencada"},
["ç"] = {"ce trencada"},
["L·L"] = {"ela geminada", "ele geminada", "el geminada"},
["l·l"] = {"ela geminada", "ele geminada", "el geminada"},
}
-- Numeric codes for four word classes (noun, adjective, verb, adverb).
local SUST = 1
local ADJ = 2
local VERB = 3
local ADV = 4
-- Maps the accepted spellings of a word-class name (Spanish full names and abbreviations) to the codes above.
-- NOTE(review): "cg" presumably stands for "categoría gramatical"; the code that reads this table is outside the part
-- of the file reviewed here.
local normalizar_cg = {
["s"] = SUST,
["sust"] = SUST,
["sustantivo"] = SUST,
["adj"] = ADJ,
["adjetivo"] = ADJ,
["v"] = VERB,
["verb"] = VERB,
["verbo"] = VERB,
["adv"] = ADV,
["adverbio"] = ADV,
}
-- Replace every precomposed dot-above letter listed in `decompose_dotover` with its base letter followed by the
-- combining DOTOVER character; all other characters are left as they are.
local function descomponer(text)
	local dotted_letter = "[" .. dotover_keys .. "]"
	return strsub(text, dotted_letter, decompose_dotover)
end
-- Normalise raw input text before phonetic processing: lowercase it, decompose dot-above letters, turn
-- fragment-delimiting punctuation into foot boundaries ("|"), drop the remaining punctuation, turn hyphens into
-- spaces, and tidy up repeated bars and whitespace.
local function normalizar(texto)
	local limpio = descomponer(strlower(texto))
	-- Ordered list of {pattern, replacement}; each step works on the output of the previous one.
	local pasos = {
		{PUNTUACION, " | "}, -- whatever delimits fragments becomes an IPA foot boundary |
		{PUNTUACION_EXTRA, ""}, -- remove any punctuation that is still left
		{"[%-‐]", " "}, -- hyphens become spaces (austro-húngaro, franco-italiano)
		{"%s*|%s*|%s*", " | "}, -- collapse doubled bars and the spaces around them
		{"%s+", " "}, -- collapse runs of whitespace
	}
	for _, paso in ipairs(pasos) do
		limpio = strsubrep(limpio, paso[1], paso[2])
	end
	-- Finally trim spaces and bars from the edges.
	limpio = strstrip(limpio, "[%s|]+")
	return limpio
end
-- Mark unstressed function words and apply the special treatment of [[amb]] and [[per]].
--
-- `words_` is a list of words (strings); it is not modified. Returns a new list in which:
--   * every word found in `unstressed_words` has DOTOVER inserted after its last vowel, so that split_syllables()
--     does not assign it a default stress;
--   * [[amb]] directly followed by a vowel-initial word (optionally starting with h-) is joined to that word with
--     the tie character ‿;
--   * any remaining [[amb]] is respelled "am" and [[per]] is respelled "perr" (both keeping their DOTOVER).
--
-- Fix over the previous version: the results were assigned to the loop variable of the generic `for` instead of to
-- `words[i]`, so neither the DOTOVER marking nor the final respellings ever reached the returned list (and, as a
-- consequence, [[amb]] was never recognised for joining).
local function handle_unstressed_words(words_)
	local words = m_table.deepcopy(words_)
	local marked_amb = "a" .. DOTOVER .. "mb"

	-- Check if the word at index `i` in `list` is (already marked) "amb" and the following word begins with a vowel.
	local function is_amb_to_join(list, i)
		return i < #list and list[i] == marked_amb and strfind(list[i + 1], "^h?" .. V)
	end

	local saw_amb_to_join = false
	-- Mark all unstressed words with DOTOVER, so that split_syllables() doesn't assign stress. We need to do this
	-- before special handling for [[amb]], because [[amb]] may join to another unstressed word like [[el]], in the
	-- process losing the identity of the two words. In the process, see if [[amb]] occurs before a following
	-- vowel-initial word (which may begin with h-).
	for i, word in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of [[que]]). It doesn't actually matter where we put
		-- it, because split_syllables() just looks for DOTOVER anywhere in the word.
		if unstressed_words[word] then
			words[i] = strsub(word, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		-- The following word may not be marked yet; that is fine because DOTOVER goes after the vowel, so the
		-- "begins with a vowel" test gives the same answer either way.
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end

	-- Join [[amb]] before vowel-initial word with following word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				insert(new_words, words[i] .. "‿" .. words[i + 1])
				i = i + 2
			else
				insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end

	-- Finally, rewrite some unstressed words to get the right pronunciation. Any remaining [[amb]] not before a
	-- vowel-initial word is pronounced [am] even in Valencian (where [amp]/[amb] would be expected), and [[per]] always
	-- has a pronounced <r>.
	local unstressed_word_replacement = {
		[marked_amb] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}
	for i, word in ipairs(words) do
		words[i] = unstressed_word_replacement[word] or word
	end
	return words
end
-- Respell a few prefixes so that later stages pronounce them correctly: <s> becomes <z> in enfons-, endins- and
-- trans- (tràns-) in the contexts listed below, and initial inex- gets an explicit syllable break (in.ex-).
local function fix_prefixes(word)
	local following_vowel = "([aàeèéiíoòóuú])"
	-- Voiced s in prefix roots -fons-, -dins-, -trans-
	local respelled = strsub(word, "^enfons" .. following_vowel, "enfonz%1")
	respelled = strsub(respelled, "^endins" .. following_vowel, "endinz%1")
	-- For trans- the <s> is also voiced before the consonants b d g h l m v; this rule is not anchored to the
	-- beginning of the word.
	respelled = strsub(respelled, "tr([aà])ns([aàeèéiíoòóuúbdghlmv])", "tr%1nz%2")
	-- in + ex > ineks/inegz
	return strsub(respelled, "^inex", "in.ex")
end
-- Some forms are conventionally written without the diacritic that would mark a hiatus. Put a diacritic back on
-- the relevant word-final sequences so that the vowels are later split into separate syllables.
local function restore_diaereses(word)
	-- Ordered {pattern, replacement} pairs; every rule is anchored at the end of the word and is applied to the
	-- result of the previous one. The [^gq]u variants avoid touching the silent u of gu/qu.
	local rules = {
		{"([iu])um(s?)$", "%1üm%2"}, -- Latinisms (-ius is ambiguous but rare)
		{"([aeiou])isme(s?)$", "%1ísme%2"}, -- suffix -isme
		{"([aeiou])ist([ae]s?)$", "%1íst%2"}, -- suffix -ista
		{"([aeou])ir$", "%1ír"}, -- verbs -ir
		{"([aeou])int$", "%1ínt"}, -- present participle
		{"([aeo])ir([éà])$", "%1ïr%2"}, -- future
		{"([^gq]u)ir([éà])$", "%1ïr%2"},
		{"([aeo])iràs$", "%1ïràs"},
		{"([^gq]u)iràs$", "%1ïràs"},
		{"([aeo])ir(e[mu])$", "%1ïr%2"},
		{"([^gq]u)ir(e[mu])$", "%1ïr%2"},
		{"([aeo])iran$", "%1ïran"},
		{"([^gq]u)iran$", "%1ïran"},
		{"([aeo])iria$", "%1ïria"}, -- conditional
		{"([^gq]u)iria$", "%1ïria"},
		{"([aeo])ir(ie[sn])$", "%1ïr%2"},
		{"([^gq]u)ir(ie[sn])$", "%1ïr%2"},
	}
	local restored = word
	for _, rule in ipairs(rules) do
		restored = strsub(restored, rule[1], rule[2])
	end
	return restored
end
-- Resolve written <y>: the digraph ny becomes ñ; any other y becomes the vowel i when it is not next to a vowel on
-- the side being tested, and is otherwise left as y (treated as consonantal /j/ later on).
local function fix_y(word)
	local vowels = "aeiouàèéêëíòóôúïü"
	local fixed = strsub(word, "ny", "ñ")
	-- y followed by a non-vowel
	fixed = strsub(fixed, "y([^" .. vowels .. "])", "i%1")
	-- y preceded by a non-vowel that is not a syllable separator either
	fixed = strsub(fixed, "([^" .. vowels .. "·%-%.])y", "%1i")
	return fixed
end
-- Guess the quality of a mid vowel in some word-final sequences by adding the corresponding accent. Each time a
-- rule actually changes the word, the tracking page "ca-vocales" is invoked through Módulo:traza.
local function mid_vowel_fixes(word)
	-- Record that an automatic mid-vowel guess was made.
	local function track()
		require("Módulo:traza")("ca-vocales")
	end

	-- Ordered {pattern, replacement} pairs, applied one after another to the running result. Note that there is no
	-- rule for final -el.
	local rules = {
		{"e(nts?)$", "é%1"}, -- -ent, -ents
		{"e(rs?)$", "é%1"}, -- -er, -ers
		{"o(rs?)$", "ó%1"}, -- -or, -ors
		{"è(s?)$", "ê%1"}, -- -è, -ès
		{"e(s[oe]s)$", "ê%1"}, -- -esos, -eses
		{"e(sa)$", "ê%1"}, -- -esa
	}
	for _, rule in ipairs(rules) do
		local changed
		word, changed = strsubb(word, rule[1], rule[2])
		if changed then
			track()
		end
	end
	return word
end
-- Apply the spelling-level fixes to a single word before it is split into syllables. `dialect` is one of the
-- dialect identifiers; only VALENCIANO triggers a dialect-specific step here.
local function word_fixes(word, dialect)
	-- Protect the notations "(rr)" and "(r)" with placeholder characters; "(rr)" must be handled first.
	local w = strsub(strsub(word, "%(rr%)", TEMP_PAREN_RR), "%(r%)", TEMP_PAREN_R)
	-- Double an r or s that directly follows a hyphen.
	w = strsub(w, "%-([rs]?)", "-%1%1")
	if dialect == VALENCIANO then
		w = strsub(w, "%-x", "-tx")
	end
	w = strsub(w, "rç$", "rrs") -- silent r only in plurals -rs
	-- In order: internal pause after a prefix; diacritic restoration for hiatus; ny > ñ and y > i vowel or
	-- consonant; mid-vowel guesses.
	w = mid_vowel_fixes(fix_y(restore_diaereses(fix_prefixes(w))))
	-- all words in pn- (e.g. [[pneumotòrax]] and mn- (e.g. [[mnemònic]]) have silent p/m in both Central and Valencian
	w = strsub(w, "^[pm]n", "n")
	-- Respell ch + vowel as tx, before we remove other h's after consonants.
	w = strsub(w, "ch(" .. V .. ")", "tx%1")
	-- Delete h after a consonant. This must happen here, before split_syllables(). We don't delete h after a vowel
	-- yet because it indicates a hiatus.
	return strsub(w, "(" .. C .. ")h", "%1")
end
-- Split a run of adjacent vowels into syllable records. The first vowel always starts a syllable (and receives the
-- `separator`, `has_dotover` and `has_lineunder` fields); after that, an i or u that is followed by another vowel
-- becomes the onset of that vowel's syllable. If the last syllable of the run ends up being a bare i or u, it is
-- folded into the coda of the syllable before it.
local function split_vowels(vowels, saw_dotover, saw_lineunder)
	local result = {
		{
			onset = "",
			vowel = substr(vowels, 1, 1),
			coda = "",
			separator = "",
			has_dotover = saw_dotover,
			has_lineunder = saw_lineunder,
		},
	}
	local rest = substr(vowels, 2)
	while rest ~= "" do
		local glide, nucleus
		glide, nucleus, rest = strmatch(rest, "^([iu]?)(.)(.-)$")
		insert(result, {onset = glide, vowel = nucleus, coda = ""})
	end
	local last = result[#result]
	if #result >= 2 and (last.vowel == "i" or last.vowel == "u") then
		result[#result - 1].coda = last.vowel
		result[#result] = nil
	end
	return result
end
-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`,
-- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and
-- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the
-- index of the syllable with primary stress), `is_prefix` (true if the word is a prefix, i.e. it ends in '-') and
-- `is_suffix` (true if the word begins with '-').
-- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be
-- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms.
-- If `may_be_uppercase` is true, uppercase vowels are recognized as vowels as well.
-- No default stress is assigned when DOTOVER occurs anywhere in the word.
local function split_syllables(word, stress_prefixes, may_be_uppercase)
local syllables = {}
local saw_dotover = false
local remainder = word
-- Strip a trailing and/or leading hyphen, remembering that the term is a prefix/suffix.
local is_prefix = false
if remainder:find("%-$") then -- prefix
is_prefix = true
remainder = remainder:gsub("%-$", "")
end
local is_suffix = false
if remainder:find("^%-") then -- suffix
is_suffix = true
remainder = remainder:gsub("^%-", "")
end
-- Consume the word left to right: on each pass, take a (possibly empty) run of non-vowels followed by a (possibly
-- empty) run of vowels. DOTOVER and LINEUNDER count as part of the vowel run.
while remainder ~= "" do
local consonants, vowels
-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY.
-- Not sure why.
local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or
"aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER
consonants, remainder = strmatch(remainder, "^([^" .. vowel_list .. "]*)(.-)$")
vowels, remainder = strmatch(remainder, "^([" .. vowel_list .. "]*)(.-)$")
-- Record and remove the DOTOVER / LINEUNDER marks found in this vowel run.
local this_saw_dotover = not not strfind(vowels, DOTOVER)
if this_saw_dotover then
saw_dotover = true
vowels = vowels:gsub(DOTOVER, "")
end
local this_saw_lineunder = not not strfind(vowels, LINEUNDER)
if this_saw_lineunder then
vowels = vowels:gsub(LINEUNDER, "")
end
if vowels == "" then
-- Trailing consonants: attach them to the coda of the last syllable, if there is one.
if #syllables > 0 then
syllables[#syllables].coda = syllables[#syllables].coda .. consonants
else
-- word without vowels, e.g. foot boundary |
insert(syllables, {onset = consonants, vowel = "", coda = ""})
end
else
local onset = consonants
local first_vowel = substr(vowels, 1, 1)
-- The first letter of the vowel run is moved into the onset in two cases: (1) ü, or u followed by another vowel,
-- right after g/q; (2) i followed by another vowel at the very beginning of the word (optionally after h).
if (strfind(onset, "[gqGQ]$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u")))
or ((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and
(first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I"))
then
onset = onset .. substr(vowels, 1, 1)
vowels = substr(vowels, 2)
end
-- The per-run DOTOVER/LINEUNDER flags are stored by split_vowels() on the first syllable of the run.
local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder)
vsyllables[1].onset = onset .. vsyllables[1].onset
for _, s in ipairs(vsyllables) do
insert(syllables, s)
end
end
end
-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid
for i = 2, #syllables do
local current = syllables[i]
local previous = syllables[i-1]
-- The onset is looked up in valid_onsets after removing a trailing tie character (plus optional h) and the first
-- underscore.
while not (current.onset == "" or valid_onsets[strsub(strsub(current.onset, tie_c .. "[hH]?$", ""), "_", "")]) do
local letter = substr(current.onset, 1, 1)
current.onset = substr(current.onset, 2)
if strfind(letter, "[·%-%.]") then -- syllable separators
-- An explicit separator fixes the syllable boundary: store it and stop shifting.
current.separator = letter
break
else
previous.coda = previous.coda .. letter
-- A tie character also ends the shifting once it has been moved to the coda.
if strfind(letter, tie_c) then
break
end
end
end
end
-- Detect stress
for i, syll in ipairs(syllables) do
if strfind(syll.vowel, "^[" .. written_stressed_vowel_l .. "]$") then
syll.stressed = true
-- primary stress: the last one stressed without LINEUNDER
if not syll.has_lineunder then
syllables.stress = i
end
end
end
-- Assign default stress
if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then
local count = #syllables
if count == 1 then
if syllables[1].vowel ~= "" then -- vowel-less words don't get stress
syllables.stress = 1
end
else
local final = syllables[count]
-- Penultimate stress if the final coda is empty or consists only of s and tie characters, or if it is n (tie
-- characters allowed around it) after e, i or ï; otherwise final stress.
-- Take account of tie symbols (apostrophes and ‿).
if strfind(final.coda, "^[s" .. tie_l .. "]*$") or (strfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and (
final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then
syllables.stress = count - 1
else
syllables.stress = count
end
end
if syllables.stress then
syllables[syllables.stress].stressed = true
end
end
syllables.is_prefix = is_prefix
syllables.is_suffix = is_suffix
return syllables
end
-- Per-dialect IPA values for the three written vowels ê, ë and ô, which are the ones whose value differs between
-- the tables below.
-- NOTE(review): the code that reads these three tables is outside the part of the file reviewed here.
local IPA_vowels_central = {
["ê"] = "ɛ", ["ë"] = "ɛ", ["ô"] = "ɔ",
}
local IPA_vowels_balearic = {
["ê"] = "ə", ["ë"] = "ɛ", ["ô"] = "ɔ",
}
local IPA_vowels_valencian = {
["ê"] = "e", ["ë"] = "e", ["ô"] = "o",
}
-- Dialect-independent mapping from accented written vowels to IPA vowel symbols. Its values for ê, ë and ô are
-- the same as in IPA_vowels_central.
local IPA_vowels = {
["à"] = "a",
["è"] = "ɛ", ["ê"] = "ɛ", ["ë"] = "ɛ", ["é"] = "e",
["í"] = "i", ["ï"] = "i",
["ò"] = "ɔ", ["ô"] = "ɔ", ["ó"] = "o",
["ú"] = "u", ["ü"] = "u",
}
-- Character class made of all the values of IPA_vowels (order unspecified; repeated symbols are harmless in a class).
local IPA_VOWEL_CLUSTER = "[" .. concat_vals(IPA_vowels) .. "]"
-- Apply the consonant respellings that do not depend on context, turning written consonant letters and digraphs
-- into (mostly) IPA symbols. The rules are applied strictly in the order listed; several of them rely on it.
local function replace_context_free(cons)
	-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert them
	-- to tied multi-character affricates at the end of join_syllables().
	-- Don't replace x -> ʃ here so we can distinguish x from manually specified ʃ.
	local rules = {
		{"ŀ", "l"},
		{"r", "ɾ"},
		{"ɾɾ", "r"}, -- i.e. written rr, after the previous rule
		{"ss", "s"},
		{"ll", "ʎ"},
		{"ñ", "ɲ"}, -- hint ny > ñ
		{"[dt]j", "ʤ"},
		{"tx", "ʧ"},
		{"[dt]z", "ʣ"},
		{"ç", "s"},
		{"[cq]", "k"},
		{"h", ""},
		{"j", "ʒ"},
		{"i", "j"}, -- must be after j > ʒ
		{"y", "j"}, -- must be after j > ʒ and fix_y
		{"[uü]", "w"},
		{"'", "‿"},
	}
	local out = cons
	for _, rule in ipairs(rules) do
		out = strsub(out, rule[1], rule[2])
	end
	return out
end
-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code
-- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the
-- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move
-- more of the work done in preprocess_word(), e.g. most of replace_context_free(), here.
local function postprocess_general(text, dialect)
local voiced = listToSet({"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"})
--local voiced_keys = concat_keys(voiced)
local voiceless = listToSet({"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"})
--local voiceless_keys = concat_keys(voiceless)
local voicing = {["p"] = "b", ["t"] = "d", ["k"] = "g", ["f"] = "v", ["s"] = "z", ["ʃ"] = "ʒ", ["ʦ"] = "ʤ",
["ʧ"] = "ʤ"}
--local voicing_keys = concat_keys(voicing)
local devoicing = {}
for k, v in pairs(voicing) do
devoicing[v] = k
end
--local devoicing_keys = concat_keys(devoicing)
------------------ Handle <x>
-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles
-- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV.
text = strsubrep(text, "([.#][eɛ]" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V ..
")", function(e, pre, post)
-- Preserve other character separators (especially the tie character ‿).
pre = pre:gsub("%.", "")
post = post:gsub("%.", "")
return e .. pre .. "g.z" .. post
end)
-- -x- at the beginning of a coda becomes [ks], e.g. [[annex]], [[apèndix]], [[extracció]]; but not elsewhere in
-- the coda, e.g. in [[romanx]], [[ponx]]; words with [ks] in -nx such as [[esfinx]], [[linx]], [[manx]] need
-- respelling with [ks]; words ending in vowel + x like [[ídix]] need respelling with [ʃ]
text = strsub(text, "(" .. V .. stress_c .. "*)x", "%1ks")
if dialect == VALENCIANO then
-- Word-initial <x> as well as <x> after a consonant other than /j/ (including in the coda, e.g. [[ponx]])
-- becomes [t͡ʃ].
text = strsub(text, "#x", "#ʧ")
text = strsub(text, "([^" .. vowel_l .. separator_l .. "j]" .. charsep_c .. "*)x", "%1ʧ")
end
-- Other x becomes [ʃ]
text = strsub(text, "x", "ʃ")
-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries?
text = strsub(text, "s(" .. charsep_c .. "*)s", "%1s")
------------------ Coda consonant losses
-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when
-- absolutely word-finally before a vowel or end of utterance (e.g. [[blanc]] has /k/ in Balearic and
-- Valencian but not [[blancs]]). Must precede consonant assimilations.
local boundary = dialect == CENTRAL and "(.)" or "([^#])"
text = strsub(text, "m[pb]" .. boundary, "m%1")
text = strsub(text, "([ln])[td]" .. boundary, "%1%2")
text = strsub(text, "[nŋ][kg]" .. boundary, "ŋ%1")
if dialect == VALENCIANO or dialect == BALEARICO then
local before_cons = "(" .. separator_c .. "*" .. C .. ")"
text = strsub(text, "m[pb]" .. before_cons, "m%1")
text = strsub(text, "([ln])[td]" .. before_cons, "%1%2")
text = strsub(text, "[nŋ][kg]" .. before_cons, "ŋ%1")
end
-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and
-- t + lateral/nasal assimilation.
text = strsub(text, "st(" .. sylsep_c .. "*[^" .. neg_guts_of_cons .. "sɾ])", "s%1")
------------------ Consonant assimilations
if dialect == CENTRAL then
-- v > b in onsets (not in codas, e.g. [[ovni]] [ɔ́vni] and [[hafni]] [ávni]). This needs to precede
-- assimilation of nb -> mb.
text = strsub(text, "v(" .. C .. "*" .. V ..")", "b%1")
end
-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation
-- because there are too many exceptions, e.g. [[aritmètic]], [[atmosfèric]], [[ètnia]]. Instead, we require that
-- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either,
-- e.g. [[atlàntic]] has [əllántik] in GDLC but [adlántik] in DNV.
--
-- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words,
-- move stuff below that special-cases tll in Valencian here.
text = strsub(text, "t(" .. sylsep_c .. ")([lʎ])", "%2%1%2")
-- n + labial > labialized assimilation
text = strsub(text, "n(" .. separator_c .. "*[mbp])", "m%1")
text = strsub(text, "n(" .. separator_c .. "*[fv])", "ɱ%1")
-- n + velar > velarized assimilation
text = strsub(text, "n(" .. separator_c .. "*[kg])", "ŋ%1")
-- l/n + palatal > palatalized assimilation
text = strsub(text, "([ln])(" .. separator_c .. "*[ʎɲʃʒʧʤ])", function(ln, palatal)
ln = ({["l"] = "ʎ", ["n"] = "ɲ"})[ln]
return ln .. palatal
end)
-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in [[menys]] or derived terms like [[menyspreu]]
-- NOTE: Per [https://giec.iec.cat/textgramatica/codi/4.4], it does apply in these scenarios but the result is
-- somewhere between [s] and [ʃ], which is why it isn't shown in GDLC.
-- text = strsub(text, "ɲs", "%1ʃ")
------------------ Handle <r>
-- In replace_context_free(), we converted single r to ɾ and double rr to r.
if dialect == CENTRAL then
text = strsub(text, TEMP_PAREN_R, "")
text = strsub(text, TEMP_PAREN_RR, "r")
elseif dialect == BALEARICO then
text = strsub(text, TEMP_PAREN_R, "")
text = strsub(text, TEMP_PAREN_RR, "")
else
assert(dialect == VALENCIANO, ("Unrecognized dialect '%s'"):format(dialect))
text = strsub(text, TEMP_PAREN_R, "ɾ")
text = strsub(text, TEMP_PAREN_RR, "ɾ")
end
if dialect ~= VALENCIANO then
-- Coda /ɾ/ -> /r/
-- FIXME: This is inherited from the older code. Correct?
text = strsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r")
end
-- ɾ -> r word-initially or after [lns]; needs to precede voicing assimilation as <s> will be voiced to [z] before
-- /ɾ/.
text = strsub(text, "([#lns]" .. sylsep_c .. "*)ɾ", "%1r")
------------------ Voicing assimilation
-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in
-- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we
-- do it character by character.
local chars = strexplode(text)
-- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because
-- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which
-- are always "##".)
for i = #chars - 2, 3, -1 do
-- We are looking for two consonants next to each other, possibly separated by a syllable or word divider.
-- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally.
-- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda.
local first = chars[i]
-- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which
-- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that
-- consonant (empty string forces devoicing).
local second
-- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning
-- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of <s> word-internally
-- separately, in preprocess_word() [FIXME: maybe move much of the processing in preprocess_word() into this
-- function].
local word_boundary_before_vowel
if not strfind(first, C) then
-- leave `second` at nil; no assimilation
elseif chars[i + 1] == "#" then -- word boundary
if chars[i + 2] == " " then
-- chars[i + 3] should always be "#"
assert(chars[i + 3] == "#", "Word boundary followed by space but not #")
if strfind(chars[i + 4], C) then
second = chars[i + 4]
else
second = ""
word_boundary_before_vowel = true
end
else
second = ""
end
elseif strfind(chars[i + 1], sylsep_c) then -- syllable boundary
if strfind(chars[i + 2], C) then
second = chars[i + 2]
else
second = ""
end
elseif strfind(chars[i + 1], C) then
second = chars[i + 1]
else
-- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation
end
if second then
-- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary.
local in_coda = false
local j = i - 1
while true do
assert(j > 0, "Missing word boundary at beginning of overall respelling")
if strfind(chars[j], "[" .. sylsep_l .. wordsep_l .. "]") then
break
elseif strfind(chars[j], V) then
in_coda = true
break
end
j = j - 1
end
if in_coda then
if word_boundary_before_vowel and strfind(first, "[zʒʣʤ]") then
-- leave alone
elseif voiced[second] and voicing[first] or word_boundary_before_vowel and strfind(first, "[sʃʦʧ]") then
chars[i] = voicing[first]
elseif (voiceless[second] or second == "") and devoicing[first] then
chars[i] = devoicing[first]
end
end
end
end
text = concat(chars)
-- gn -> ŋn e.g. [[regnar]] (including word-initial gn- e.g. [[gnòmic]], [[gneis]])
-- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words
-- (here I'm guessing yes).
if dialect ~= CENTRAL then
text = strsub(text, "#gn", "#n")
end
text = strsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")
-- gʒ > d͡ʒ
-- FIXME: We need to investigate if it operates across words
text = strsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ")
-- sʃ -> ʃ ([[desxifrar]]), zʒ -> ʒ ([[disjuntor]])
if dialect ~= VALENCIANO then
text = strsub(text, "s(" .. separator_c .. "*ʃ)", "%1")
text = strsub(text, "z(" .. separator_c .. "*ʒ)", "%1")
end
------------------ Gemination of <bl>, <gl>
if dialect ~= VALENCIANO then
-- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use <b_l> or <g_l>.
-- This must follow v > b above. To force a hard ungeminated [b] or [g], use <_b> or <_g>.
text = strsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")([bg])l", "%1%3%2%3l")
else -- Valencian; undo manually written 'bbl', 'ggl' in words like [[poblar]], [[reglament]]
text = strsub(text, "([bg])(" .. sylsep_c .. ")%1l", "%2%1l")
end
------------------ Lenition of voiced stops
-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a
-- vowel and (except for <d>) after <l> and <ll> (cf. GDLC [[cabellblanc]] [kəβɛ̀ʎβláŋ]). This also happens across
-- word boundaries but doesn't happen after stops, nor in Central Catalan after [r], [ɾ] or [z] (and hence probably
-- not after [ʒ] either, although I can't find any examples in GDLC).
--
-- In Valencian, <b> doesn't lenite (at least formally?), but <d> and <g> do lenite after [r], [ɾ] or [z].
--
-- Balearic is like Valencian in not leniting <b>, and probably like Central Catalan otherwise.
local lenite_bdg = {["b"] = "β", ["d"] = "ð", ["g"] = "ɣ"}
if dialect == CENTRAL then
text = strsub(text, "([" .. vowel_l .. "jwlʎv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([bdg])",
function(before, bdg) return before .. lenite_bdg[bdg] end)
elseif dialect == VALENCIANO then
text = strsub(text, "([" .. vowel_l .. "jwlʎvrɾzʣ]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",
function(before, dg) return before .. lenite_bdg[dg] end)
else
assert(dialect == BALEARICO, ("Unrecognized dialect '%s'"):format(dialect))
text = strsub(text, "([" .. vowel_l .. "jwlʎv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",
function(before, dg) return before .. lenite_bdg[dg] end)
end
------------------ Vowel reduction
-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan).
if dialect ~= VALENCIANO then
-- The following rules seem to apply, based on the old code:
-- (1) Stressed a and e are never reduced.
-- (2) Unstressed e directly following ə is not reduced.
-- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced.
-- (4) Written <ee> when both vowels precede the primary stress is reduced to [əə]. (This rule preempts #2.)
-- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all.
-- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was
-- implemented.
-- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances.
local words = strsplit(text, " ")
for j, word in ipairs(words) do
local chars = strexplode(word)
-- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we
-- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we
-- shoudn't run off the end (and have assertions to check this).
local seen_primary_stress = false
for i = 2, #chars - 1 do
local this = chars[i]
if chars[i] == AC then
seen_primary_stress = true
end
if (this ~= "a" and this ~= "e") or strfind(chars[i + 1], stress_c) then
-- Not a/e, or a stressed vowel; continue
else
local reduction = true
local prev, prev_stress, nxt, nxt_stress
if not strfind(chars[i - 1], sylsep_c) then
prev = ""
else
prev = chars[i - 2] -- this should be non-nil as chars[i - 1] is a syllable separator (not #)
assert(prev, "Missing # at word boundary")
prev_stress = ""
if strfind(prev, stress_c) then
prev_stress = prev
prev = chars[i - 3]
-- As above; chars[i - 2] is a stress indicator (not #).
assert(prev, "Missing # at word boundary")
end
end
if not strfind(chars[i + 1], sylsep_c) then
nxt = ""
-- leave nxt_stress at nil
else
nxt = chars[i + 2]
nxt_stress = chars[i + 3]
-- chars[i + 1] is a syllable separator, so chars[i + 2] should not be a word boundary, so
-- chars[i + 3] should exist.
assert(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary")
end
if this == "e" and strfind(prev, "ə") then
reduction = false
elseif this == "e" and strfind(nxt, "[aɔ]") then
reduction = false
elseif this == "e" and nxt == "e" and not strfind(nxt_stress, AC) then
-- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary.
if not seen_primary_stress then
chars[i + 2] = "ə"
else
reduction = false
end
end
if reduction then
chars[i] = "ə"
end
end
end
words[j] = concat(chars)
end
text = concat(words, " ")
end
if dialect == CENTRAL then
-- Reduction of unstressed o (not before w)
text = strsub(text, "o([^" .. stress_l .. "w])", "u%1")
elseif dialect == BALEARICO then
-- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/;
-- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks
-- or word separators).
text = strsub(text, "o([^" .. vowel_l .. stress_l .. wordsep_l .. "]*[iu]" .. stress_c .. ")", "u%1")
end
-- Final losses.
text = strsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/
text = strsub(text, "([ʃʧs])s#", "%1#") -- homophone plurals -xs, -igs, -çs
if dialect ~= VALENCIANO then
-- Remove j before palatal obstruents
text = strsub(text, "j(" .. sylsep_c .. "*[ʃʒʧʤ])", "%1")
else -- Valencian
-- Fortition of palatal fricatives
text = strsub(text, "ʒ", "ʤ")
text = strsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z")
end
if dialect ~= CENTRAL then
-- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic.
-- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling
-- instead.
text = strsub(text, "([bpw]a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = strsub(text, "([mv]e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = strsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = strsub(text, "(m[oɔ]" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = strsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = strsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1")
end
---------- Convert pseudo-symbols to real ones.
-- Convert g to IPA ɡ.
text = strsub(text, "g", "ɡ")
-- Convert pseudo-affricate symbols to full affricates.
local full_affricates = { ["ʦ"] = "t͡s", ["ʣ"] = "d͡z", ["ʧ"] = "t͡ʃ", ["ʤ"] = "d͡ʒ" }
text = strsub(text, "([ʦʣʧʤ])", full_affricates)
---------- Generate IPA stress marks.
-- Convert acute and grave to IPA stress marks.
text = strsub(text, AC, "ˈ")
text = strsub(text, GR, "ˌ")
-- Move IPA stress marks to the beginning of the syllable.
text = strsubrep(text, "([#.])([^#.]*)(" .. ipa_stress_c .. ")", "%1%3%2")
-- Suppress syllable divider before IPA stress indicator.
text = strsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1")
-- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above.
-- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in [[Module:pt-pronunc]].
-- text = strsubrep(text, "ˈ([^ ]+)ˈ", "ˌ%1ˈ")
-- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.)
-- text = strsubrep(text, "ˈ([^#]*#" .. PREFIX_MARKER .. ")", "ˌ%1")
-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant
-- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose.
text = strsub(text, "[#_" .. PSEUDOCONS .. "]", "")
text = strnfc(text)
return text
end
-- Resolve orthographic ambiguities of one word and map its syllables to pseudo-IPA.
--
-- Parameters:
--   syllables         sequence of {onset, vowel, coda, stressed} records that also carries the fields `stress`
--                     (index of the stressed syllable, may be absent), `is_prefix` and `is_suffix`. It is modified
--                     in place: the stressed vowel and the coda of the last syllable may be rewritten.
--   suffix_syllables  sequence of syllable records appended untouched after the converted syllables.
--   dialect           CENTRAL or BALEARICO select their own vowel table; any other value uses the Valencian one.
--   pos, orig_word    not used by the active code; they were only consumed by error messages that are no longer
--                     raised.
--
-- Returns a new sequence of syllable records with the same `stress`, `is_prefix` and `is_suffix` fields.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
	-- A stressed plain <e> or <o> does not say whether the vowel is close or open. This used to raise an error
	-- asking for an explicit acute, grave or combined accent. Instead of raising, assume the close vowel, which
	-- is the most frequent case (there is no rule to decide; the user has to mark the other cases explicitly).
	local stress_idx = syllables.stress
	if stress_idx then
		local stressed_syl = syllables[stress_idx]
		if strfind(stressed_syl.vowel, "[eo]") then
			stressed_syl.vowel = strsub(stressed_syl.vowel, "[eo]", {["e"] = "é", ["o"] = "ó"})
		end
	end

	-- Final -r is ambiguous in many cases.
	--
	-- In non-monosyllables, a stressed final r after a or i is taken to be (r), i.e. a verbal infinitive is
	-- assumed (NOTE: not always right; many adjectives and nouns in -ar should be marked '(rr)', and several
	-- loanword nouns in -ir should be marked 'rr'). The same goes for stressed final r or rs after é (usually
	-- adjectives or nouns in -er, although verbal infinitives should be marked 'ê(r)') and for -or(s) after
	-- s, t, d or ç. Such an r disappears except in Valencian.
	local last = syllables[#syllables]
	if #syllables > 1 and last.stressed then
		local silent_r = false
		if last.coda == "r" and strfind(last.vowel, "[aàiíé]") then
			silent_r = true
		elseif last.coda == "rs" and last.vowel == "é" then
			silent_r = true
		elseif last.vowel == "ó" and strfind(last.coda, "^rs?$") and strfind(last.onset, "[stdç]") then
			silent_r = true
		end
		if silent_r then
			last.coda = TEMP_PAREN_R
		end
	end

	-- Any other final -r or -rs used to raise an error requesting a rewrite with 'rr' (pronounced everywhere),
	-- '(rr)' (pronounced everywhere but Balearic) or '(r)' (pronounced only in Valencian). Now (r) is assumed.
	if strfind(last.coda, "^rs?$") or strfind(last.coda, "[^r]rs?$") then
		last.coda = strsub(last.coda, "r(s?)$", TEMP_PAREN_R .. "%1")
	end

	-- Work on a copy so that the orthographic codas stay available for the s-voicing test below.
	local result = {stress = syllables.stress, is_prefix = syllables.is_prefix, is_suffix = syllables.is_suffix}
	for idx, src in ipairs(syllables) do
		result[idx] = {onset = src.onset, vowel = src.vowel, coda = src.coda, stressed = src.stressed}
	end

	local dialect_vowels = dialect == CENTRAL and IPA_vowels_central or
		dialect == BALEARICO and IPA_vowels_balearic or
		IPA_vowels_valencian

	-- Replace letters with IPA equivalents.
	for idx, syl in ipairs(result) do
		-- Voicing of s: a lone onset <s> after a syllable whose written coda is empty, <i> or <u>.
		if syl.onset == "s" and idx > 1 and strfind(syllables[idx - 1].coda, "^[iu]?$") then
			syl.onset = "z"
		end

		-- Onset spellings whose value depends on a following front vowel.
		if strfind(syl.vowel, "^[eèéêëií]$") then
			local onset = syl.onset
			onset = strsub(onset, "tg$", "ʤ")
			onset = strsub(onset, "[cg]$", {["c"] = "s", ["g"] = "ʒ"})
			onset = strsub(onset, "[qg]u$", {["qu"] = "k", ["gu"] = "g"})
			syl.onset = onset
		end

		syl.coda = strsub(syl.coda, "igs?$", "iʤ")
		syl.onset = replace_context_free(syl.onset)
		syl.coda = replace_context_free(syl.coda)

		-- Dialect-specific vowel values first, then the vowel mapping shared by all dialects.
		syl.vowel = strsub(strsub(syl.vowel, ".", dialect_vowels), ".", IPA_vowels)
	end

	for _, extra in ipairs(suffix_syllables) do
		insert(result, extra)
	end
	return result
end
-- Convert a single respelled word into a dot-separated string of pseudo-IPA syllables, with AC (primary stress)
-- or GR (secondary stress) placed right after the vowel of each stressed syllable.
--
-- Parameters:
--   word     the word to convert.
--   dialect  dialect code, passed on to word_fixes() and preprocess_word().
--   pos      optional part-of-speech value; when it is missing or equal to ADV, a final -ment may be split off.
local function convertir_palabra(word, dialect, pos)
	local orig_word = word
	local suffix_syllables = {}

	-- Adverbs in -ment: peel the suffix off, convert the stem as an adjective and re-attach a fixed stressed
	-- syllable afterwards. Without an explicit ADV the stem must contain at least two vowels and must not end
	-- in i/ï.
	if not pos or pos == ADV then
		local stem = strmatch(word, "^(.*)(m[eé]nt)$")
		if stem then
			local is_adverb = pos == ADV or
				(not strfind(stem, "[iï]$") and strfind(stem, V .. ".*" .. V))
			if is_adverb then
				suffix_syllables = {{onset = "m", vowel = "e", coda = "nt", stressed = true}}
				pos = ADJ
				word = stem
			end
		end
	end

	word = word_fixes(word, dialect)
	local syllables = preprocess_word(split_syllables(word), suffix_syllables, dialect, pos, orig_word)

	-- Combine syllables.
	local has_ment = #suffix_syllables > 0
	local last_idx = #syllables
	local pieces = {}
	for i, syl in ipairs(syllables) do
		-- With -ment the primary stress goes on the last syllable (the suffix); otherwise it goes on the
		-- stressed syllable, unless the word is a prefix.
		local is_primary
		if has_ment then
			is_primary = i == last_idx
		else
			is_primary = i == syllables.stress and not syllables.is_prefix
		end

		local mark = ""
		if is_primary then
			mark = AC -- primary stress
		elseif syl.stressed then
			mark = GR -- secondary stress
		end
		pieces[i] = syl.onset .. syl.vowel .. mark .. syl.coda
	end
	return concat(pieces, ".")
end
-- Generate the pronunciation of a whole respelling for one dialect.
--
-- Parameters:
--   text     respelling; feet are separated by "|" and words by whitespace.
--   dialect  dialect code passed on to convertir_palabra() and postprocess_general().
--   pos      table indexed by running word position (counted across all feet) whose entries are handed to
--            convertir_palabra() as the part of speech of that word.
--
-- Returns whatever postprocess_general() returns for the assembled string.
local function generar_pron_aux(text, dialect, pos)
	local feet = {}
	local word_idx = 0 -- running word counter, shared by all feet
	for _, foot in ipairs(strsplit(text, "%s*|%s*")) do
		local words = handle_unstressed_words(strsplit(foot, "%s"))
		local converted = {}
		for _, w in ipairs(words) do
			word_idx = word_idx + 1
			insert(converted, convertir_palabra(w, dialect, pos[word_idx]))
		end
		insert(feet, concat(converted, " "))
	end

	-- Put double ## at utterance boundaries (beginning/end of string) and at foot boundaries (marked with |).
	-- If the string without pound signs is 'foo bar baz | bat quux', the final string will be
	-- '##foo# #bar# #baz## #|# ##bat# #quux##'.
	local joined = "##" .. concat(feet, "# | #") .. "##"
	return postprocess_general(strsub(joined, " ", "# #"), dialect)
end
-- Generate the pronunciation of a respelling in the three supported dialects.
--
-- Raises an error when the text contains á, ì or ù (either case).
--
-- Returns two parallel lists with one entry per dialect, in the order central, Valencian, Balearic:
--   1. the dialect labels, each wrapped in a one-element table;
--   2. the pronunciations, each passed through strhtml() and wrapped in a one-element table.
local function generar_pron(text, pos)
	if strfind(text, "[áìùÁÌÙ]") then
		error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions"):format(text))
	end

	local normalized = normalizar(text)
	local dialects = {
		{CENTRAL, "central"},
		{VALENCIANO, "valenciano"},
		{BALEARICO, "baleárico"},
	}

	local labels, prons = {}, {}
	for idx, entry in ipairs(dialects) do
		labels[idx] = {entry[2]}
		prons[idx] = {strhtml(generar_pron_aux(normalized, entry[1], pos))}
	end
	return labels, prons
end
-- Determine the stress type of a generated pronunciation.
--
-- Returns nil when `w` is not a string. Otherwise returns the stress type and the number of syllables:
--   "doble"           at least four syllables, the last two being "men" and "te";
--   "monosílaba"      exactly one syllable;
--   "aguda" / "llana" / "esdrújula" / "sobreesdrújula"
--                     the syllable containing ˈ is the last / second to last / third to last / any other one.
-- Raises an error when a word of another length contains no ˈ mark.
local function determinar_acentuacion(w)
	if type(w) ~= "string" then
		return nil
	end

	-- First pass: bare syllables, used for counting and for the suffix check.
	local syllables = {}
	for syl in strmatchit(w, "[^" .. sylsep_l .. "]+") do
		syllables[#syllables + 1] = syl
	end
	local count = #syllables

	if count >= 4 and syllables[count - 1] == "men" and syllables[count] == "te" then
		return "doble", count
	end
	if count == 1 then
		return "monosílaba", count
	end

	-- Second pass: each chunk keeps the separators that precede its syllable, and the ˈ mark is searched for
	-- in that chunk. The stress type is keyed by how many syllables follow the stressed one.
	local by_distance = {[0] = "aguda", [1] = "llana", [2] = "esdrújula"}
	local position = 0
	for chunk in strmatchit(w, sylsep_c .. '*' .. "[^" .. sylsep_l .. "]+") do
		position = position + 1
		if strfind(chunk, "ˈ") then
			return by_distance[count - position] or "sobreesdrújula", count
		end
	end
	error("Se esperaba que la pronunciación de la palabra hubiera sido generada con las marcas de acentuación")
end
-- Fill in the pronunciation-related arguments of an entry.
--
-- Parameters:
--   titulo  page title; used as the respelling when args["ayuda"] is empty and to decide whether the entry is a
--           single word (no whitespace).
--   args    argument table, modified in place and returned. Fields read and/or written here:
--           "ayuda" (respellings), "ayudaextra" (";"-separated flags per respelling), "fone", "fono", "pron",
--           "fgraf", "tl", "rima", "ls" (syllable counts) and "ac" (stress types).
--
-- Behaviour:
--   * When neither "fone" nor "fono" has entries, pronunciations are generated from "ayuda" (at most 9 in
--     total) and appended to "pron"/"fone"; "fgraf" also receives the respelling when it was given explicitly.
--   * The rhyme is derived from the last word of the first "fone" entry.
--   * For titles without whitespace, the distinct syllable counts and stress types of all "fone" entries are
--     appended to "ls" and "ac" in the order in which they are first seen.
function export.procesar_pron_args(titulo, args)
	local vino_ayuda = #args["ayuda"] >= 1
	if not vino_ayuda then
		args["ayuda"][1] = titulo
	end

	if #args["fone"] < 1 and #args["fono"] < 1 then
		-- A respelling found in pron_abc is replaced by the list stored there (also kept in "tl").
		local predefinida = pron_abc[args["ayuda"][1]]
		if predefinida then
			args["ayuda"] = predefinida
			args["tl"] = predefinida
		end

		local total = #args["ayuda"]
		local j = 1 -- index of the current respelling
		local k = 1 -- one more than the number of pronunciations inserted so far (maximum 9)
		while k <= 9 and j <= total do
			-- Translate the flags given for this respelling; unknown flags are ignored.
			local cg = {}
			local flags = args["ayudaextra"][j] and strsplit(args["ayudaextra"][j], ";") or {}
			for _, flag in ipairs(flags) do
				local z = normalizar_cg[flag]
				if z then
					insert(cg, z)
				end
			end

			local pron, fone = generar_pron(args["ayuda"][j], cg)
			for i, _ in ipairs(fone) do
				insert(args["pron"], pron[i])
				insert(args["fone"], fone[i])
				if vino_ayuda then
					insert(args["fgraf"], {args["ayuda"][j]})
				end
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end

	local tiene_espacios = strfind(titulo, "%s")
	local primera = args["fone"][1] and args["fone"][1][1]

	if primera then
		-- Keep only the last word, then what follows its last primary-stress mark, then the part starting at
		-- the first match of IPA_VOWEL_CLUSTER.
		local rim = strsub(primera, ".*%s([^%s]+)$", "%1")
		rim = strsub(rim, "^.*ˈ(.-)$", "%1")
		args["rima"][1] = strsub(rim, ".-" .. "(" .. IPA_VOWEL_CLUSTER .. ".*" .. ")" .. "$", "%1")
	end

	if not tiene_espacios and primera then
		-- Collect distinct values in first-seen order: iterating a set with pairs() would give an unspecified
		-- order. Everything is gathered before touching args so that an error raised by
		-- determinar_acentuacion() leaves "ls" and "ac" unmodified.
		local ls_vistos, ls_orden = {}, {}
		local ac_vistos, ac_orden = {}, {}
		for _, f in ipairs(args["fone"]) do
			local ace, lon = determinar_acentuacion(f[1])
			-- determinar_acentuacion() returns nil for non-string input; using nil as a table key would
			-- raise "table index is nil", so such entries are skipped.
			if ace ~= nil and lon ~= nil then
				if not ls_vistos[lon] then
					ls_vistos[lon] = true
					insert(ls_orden, lon)
				end
				if not ac_vistos[ace] then
					ac_vistos[ace] = true
					insert(ac_orden, ace)
				end
			end
		end
		for _, lon in ipairs(ls_orden) do
			insert(args["ls"], lon)
		end
		for _, ace in ipairs(ac_orden) do
			insert(args["ac"], ace)
		end
	end

	return args
end
return export