Pergi ke kandungan

Modul:lt-common

Daripada Wikikamus

local export = {}

local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char
local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper

-- Unicode combining diacritical marks, each built from its code point.
local grave = u(0x0300)     -- U+0300 COMBINING GRAVE ACCENT
local acute = u(0x0301)     -- U+0301 COMBINING ACUTE ACCENT
local tilde = u(0x0303)     -- U+0303 COMBINING TILDE
local macron = u(0x0304)    -- U+0304 COMBINING MACRON
local dotabove = u(0x0307)  -- U+0307 COMBINING DOT ABOVE
local caron = u(0x030C)     -- U+030C COMBINING CARON
local ogonek = u(0x0328)    -- U+0328 COMBINING OGONEK
-- Pattern character class matching exactly one of grave, acute or tilde.
-- These are the marks that entryname_form strips and that makeDisplayText
-- treats as accents; macron, caron, ogonek and dot above are not included.
local accents = "[" .. grave .. acute .. tilde .. "]"

-- Maps each dotless letter to its ordinary dotted counterpart. Letters that
-- are not keys here are left unchanged by char_to_dotted_form.
local dotless_to_dotted = {
	["ı"] = "i",
	["ȷ"] = "j",
}

-- Replacement callback: rebuilds a matched letter without any dot above.
--   base:  the captured base letter (dotted or dotless "i"/"j").
--   below: the captured optional ogonek (possibly the empty string).
-- Returns the dotted form of `base` (or `base` itself when it has no entry
-- in dotless_to_dotted) followed by `below`.
local function char_to_dotted_form(base, below)
	local dotted = dotless_to_dotted[base]
	if not dotted then
		dotted = base
	end
	return dotted .. below
end

-- Strips a combining dot above from "i"/"j" (dotted or dotless, optionally
-- carrying an ogonek) and turns dotless letters in that position into their
-- dotted forms. Only sequences that actually contain the dot above are
-- touched. Returns the rewritten string alone (the gsub count is dropped).
local function dots_to_entryname_form(text)
	-- Base letter, optional ogonek, then the dot above that is to be removed.
	local dotted_sequence = "([iıjȷ])(" .. ogonek .. "?)" .. dotabove
	local result = ugsub(text, dotted_sequence, char_to_dotted_form)
	return result
end

-- Replacement callback used when an "i"/"j" is directly followed by an accent.
--   base:  the captured base letter.
--   below: the captured optional ogonek (possibly the empty string).
local function char_to_accent_form(base, below)
	if base ~= "i" and base ~= "j" then
		-- A dotless letter combining with an accent is not expected, but if
		-- one turns up it is converted to the dotted letter (with no explicit
		-- dot above added) so that it normalizes properly.
		return char_to_dotted_form(base, below)
	end
	-- Ordinary dotted letter: insert an explicit 'dot above' after the base
	-- (and after the ogonek, if any).
	return base .. below .. dotabove
end

-- Builds the display form of `text`.
--   text: the string to convert.
--   lang, sc: accepted but not used by this function.
-- Returns the NFC-normalized text in which every "i"/"j" that is immediately
-- followed by an accent (after an optional ogonek) carries an explicit
-- combining dot above.
function export.makeDisplayText(text, lang, sc)
	-- Work on the decomposed string, first bringing all dots to the entry-name
	-- form; accents are left in place by this step.
	local decomposed = dots_to_entryname_form(toNFD(text))
	-- Base letter, optional ogonek, then a frontier into the accent class.
	local before_accent = "([iıjȷ])(" .. ogonek .. "?)%f" .. accents
	local dotted = ugsub(decomposed, before_accent, char_to_accent_form)
	return toNFC(dotted)
end

-- Converts `text` to the entry-name form, still decomposed: the string is
-- put into NFD, every run of accents (grave, acute, tilde) is deleted, and
-- the dots on "i"/"j" are normalized. Callers apply NFC themselves.
local function entryname_form(text)
	local decomposed = toNFD(text)
	local unaccented = ugsub(decomposed, accents .. "+", "")
	return dots_to_entryname_form(unaccented)
end

-- Builds the entry name for `text`: the entry-name form (accents removed,
-- dots normalized) recomposed to NFC.
--   text: the string to convert.
--   lang, sc: accepted but not used by this function.
function export.makeEntryName(text, lang, sc)
	local stripped = entryname_form(text)
	return toNFC(stripped)
end

-- Per-character replacement table used by makeSortKey. Each key is a single
-- character; a matched character that is not a key is left as it is.
-- The combining marks are replaced by the code points U+F000..U+F003, and
-- "y" becomes "i" followed by U+F004.
-- NOTE(review): presumably this makes "y" sort directly after "i" and gives
-- the diacritics a fixed relative order — confirm against the intended
-- collation.
local sortkey_substitutes = {
	[ogonek] = u(0xF000),
	[caron] = u(0xF001),
	[macron] = u(0xF002),
	[dotabove] = u(0xF003),
	["y"] = "i" .. u(0xF004),
}

-- Builds the sort key for `text`.
--   text: the string to convert.
--   lang, sc: accepted but not used by this function.
-- The text is lowercased, reduced to the (decomposed) entry-name form, has
-- each character substituted according to sortkey_substitutes, and is then
-- uppercased and recomposed to NFC.
function export.makeSortKey(text, lang, sc)
	local entryname = entryname_form(ulower(text))
	-- Byte-level pattern: one byte plus any following bytes in the range
	-- 128-191, so each match is looked up in the table as a whole character.
	-- Only the first gsub result (the string) is kept.
	local substituted = entryname:gsub(".[\128-\191]*", sortkey_substitutes)
	return toNFC(uupper(substituted))
end

return export