Pergi ke kandungan

Modul:etymon/categories

Daripada Wikikamus
local export = {}

local M = require("Module:module loader").init({
	require = {
		etymology = "Module:etymology",
		affix = "Module:affix",
		etymology_specialized = "Module:etymology/specialized",
		utilities = "Module:utilities",
	},
	loadData = {
		data = "Module:etymon/data",
	},
})

-- Decide whether a keyword counts as transitive for one term.
--
-- transitive_mode: one of the M.data.TRANSITIVE constants.
-- page_lang, term_lang: language objects exposing :getCode().
-- Returns a boolean; raises an error for an unrecognised mode.
local function is_transitive(transitive_mode, page_lang, term_lang)
	local modes = M.data.TRANSITIVE

	if transitive_mode == modes.ALWAYS then
		return true
	end
	if transitive_mode == modes.NEVER then
		return false
	end

	-- Both cross-language modes give the same answer in this function:
	-- transitive exactly when the term's code differs from the page's.
	local is_cross_lang_mode = transitive_mode == modes.CROSS_LANG
		or transitive_mode == modes.CROSS_LANG_NO_INTERNAL_SOURCE
	if is_cross_lang_mode then
		return page_lang:getCode() ~= term_lang:getCode()
	end

	error("Unknown transitive mode: " .. tostring(transitive_mode))
end

-- Look up the configuration for a keyword, layering any language-specific
-- overrides on top of the shared defaults.
--
-- keyword: key into M.data.keywords.
-- lang_exc: optional table; its keyword_overrides[keyword] entry, when
--           present, supplies fields that replace the defaults.
-- Returns nil for an unknown keyword, the shared default table itself when
-- there is nothing to override, or a fresh merged table otherwise.
local function get_keyword_config(keyword, lang_exc)
	local defaults = M.data.keywords[keyword]
	if not defaults then
		-- Not a recognised keyword
		return nil
	end

	local per_language = lang_exc and lang_exc.keyword_overrides
	local overrides = per_language and per_language[keyword]
	if not overrides then
		return defaults
	end

	-- Copy defaults first, then let the overrides win field by field
	local combined = {}
	for _, layer in ipairs({ defaults, overrides }) do
		for field, value in pairs(layer) do
			combined[field] = value
		end
	end
	return combined
end

-- Return only the category name for a source; this is the second value
-- returned by Module:etymology's get_display_and_cat_name (called with
-- `true` as its second argument).
function export.get_cat_name(source)
	-- The outer parentheses truncate the result to exactly one value.
	return (select(2, M.etymology.get_display_and_cat_name(source, true)))
end

-- Short affix-type codes mapped to the full type names used in this module.
-- Codes that are absent from this table are used as given by the callers.
local aftype_aliases = {
	pre = "prefix",
	suf = "suffix",
	["in"] = "infix", -- `in` is a reserved word, so the key must be quoted
	inter = "interfix",
	circum = "circumfix",
	naf = "non-affix",
	root = "non-affix",
}

-- Register a category in the `categories` map (category name -> sort data).
--
-- The first registration creates the entry. Later registrations of the same
-- name never overwrite a sort_key/sort_base that is already set; they only
-- fill in whichever of the two is still nil.
local function add_category(categories, cat_name, sort_key, sort_base)
	local entry = categories[cat_name]

	if entry == nil then
		categories[cat_name] = { sort_key = sort_key, sort_base = sort_base }
		return
	end

	-- Assigning nil to a field that is already nil changes nothing, so no
	-- separate check on the incoming value is required.
	if entry.sort_key == nil then
		entry.sort_key = sort_key
	end
	if entry.sort_base == nil then
		entry.sort_base = sort_base
	end
end

-- Report whether `wanted` is one of the etymon IDs listed for a target.
-- Entries of `target_ids` are either plain IDs or tables carrying an `id`.
local function etymon_id_is_listed(target_ids, wanted)
	for _, id_data in ipairs(target_ids) do
		local stored_id = type(id_data) == "table" and id_data.id or id_data
		if stored_id == wanted then
			return true
		end
	end
	return false
end

-- Report whether more than one key of `senseid_parent_etymon` starts with
-- "<target_key>:". Stops scanning as soon as the second match is seen.
local function has_multiple_senseids(senseid_parent_etymon, target_key)
	local prefix = target_key .. ":"
	local prefix_len = #prefix
	local found = 0
	for key in pairs(senseid_parent_etymon) do
		if key:sub(1, prefix_len) == prefix then
			found = found + 1
			if found > 1 then
				return true
			end
		end
	end
	return false
end

-- Work out which ID (if any) is needed to disambiguate an affix part.
-- Returns the ID to attach, or nil when the term's ID does not take part in
-- any disambiguation.
local function resolve_affix_part_id(term, target_ids, senseid_parent_etymon)
	local id = term.id
	if not id then
		return nil
	end

	local has_multiple_ids = target_ids and #target_ids > 1

	-- Case 1: the ID the user gave is a known senseid of the target
	if senseid_parent_etymon and senseid_parent_etymon[term.target_key .. ":" .. id] then
		if has_multiple_senseids(senseid_parent_etymon, term.target_key) then
			-- Ambiguous senseid: keep the senseid itself
			return id
		elseif has_multiple_ids then
			-- Unique senseid but ambiguous etymon: use the etymon ID
			return term.etymon_id or id
		end
		return nil
	end

	-- Case 2: not a senseid; only relevant when several etymon IDs exist
	if not has_multiple_ids then
		return nil
	end
	if etymon_id_is_listed(target_ids, id) then
		return id
	end
	-- Fallback: an etymon ID already stored on the term (term.etymon_id)
	if term.etymon_id and etymon_id_is_listed(target_ids, term.etymon_id) then
		return term.etymon_id
	end
	return nil
end

-- Collect affix categories from the top-level group containers of `node`.
--
-- node: tree node; each entry of node.children is a container with
--       `keyword_info` and `terms`. Only containers whose
--       keyword_info.affix_categories is set contribute parts.
-- page_lang: language object of the page (needs :getCode()).
-- available_etymon_ids: map target_key -> list of etymon IDs / ID tables.
-- senseid_parent_etymon: optional map keyed by "<target_key>:<senseid>".
-- lang_exc: accepted for interface compatibility; not used here.
--
-- Returns a list of { cat = ..., sort_key = ..., sort_base = ... } tables
-- built from M.affix.get_affix_categories_only, or {} when no part was found.
local function collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc)
	local parts = {}

	for _, container in ipairs(node.children or {}) do
		local config = container.keyword_info
		if config and config.affix_categories then
			for _, term in ipairs(container.terms or {}) do
				if not term.unknown_term then
					-- Parts are numbered consecutively across all containers
					local part_index = #parts + 1
					local part_data = {
						term = term.title,
						tr = term.tr,
						ts = term.ts,
						alt = term.alt,
						itemno = part_index,
						orig_index = part_index,
					}

					-- Affix type: explicit aftype > pos=root > left unset here
					if term.aftype then
						part_data.type = aftype_aliases[term.aftype] or term.aftype
					elseif term.args and term.args.pos == "root" then
						part_data.type = "non-affix"
					end

					-- Only record the language when it differs from the page's
					if term.lang:getCode() ~= page_lang:getCode() then
						part_data.lang = term.lang
					end

					local target_ids = available_etymon_ids[term.target_key]
					local matched_id = resolve_affix_part_id(term, target_ids, senseid_parent_etymon)

					-- An explicit override forces the ID even without a match
					if term.override or matched_id then
						part_data.id = matched_id or term.id
					end

					parts[part_index] = part_data
				end
			end
		end
	end

	if #parts == 0 then return {} end

	local affix_data = {
		lang = page_lang,
		parts = parts,
		pos = "term",
	}

	if #parts == 1 then
		affix_data.allow_no_affixes_or_compounds = true
	end

	local affix_categories = M.affix.get_affix_categories_only(affix_data)

	-- Normalise: entries may be plain strings or tables with sort data
	local result = {}
	for _, cat in ipairs(affix_categories) do
		if type(cat) == "table" then
			table.insert(result, { cat = cat.cat, sort_key = cat.sort_key, sort_base = cat.sort_base })
		else
			table.insert(result, { cat = cat })
		end
	end

	return result
end

-- Namespace prefixes accepted when scraping category links out of generated
-- wikitext: the canonical English name and the Malay one.
local CATEGORY_NAMESPACES = { Category = true, Kategori = true }

-- Add borrowing-related categories (top-level only).
--
-- categories: map filled through add_category.
-- page_lang: language object of the page.
-- term: term table; term.lang is the source language.
-- config: keyword config; reads `borrowing_type` and `specialized_borrowing`.
local function collect_borrowing_categories(categories, page_lang, term, config)
	if config.borrowing_type == "borrowed" then
		local borrowed = {}
		M.etymology.insert_borrowed_cat(borrowed, page_lang, term.lang)
		for _, cat in ipairs(borrowed) do
			add_category(categories, cat)
		end
	end

	if config.specialized_borrowing then
		-- The specialized module only returns wikitext, so the category names
		-- have to be recovered from the links it emits.
		local wikitext = M.etymology_specialized.specialized_borrowing {
			bortype = config.specialized_borrowing,
			lang = page_lang,
			sources = { term.lang },
			terms = { { lang = term.lang, term = "-" } },
			notext = true,
			nocat = false,
		}

		-- Capture "[[<namespace>:<name>" and stop the name at "|" or "]" so a
		-- trailing sort key ("[[Category:Name|key]]") is not taken as part of
		-- the category name. A leading colon ("[[:Category:...") marks a plain
		-- link rather than a categorisation and is deliberately not matched.
		for namespace, cat_name in wikitext:gmatch("%[%[([^%[%]|:]+):([^%]|]+)") do
			if CATEGORY_NAMESPACES[namespace] then
				add_category(categories, cat_name)
			end
		end
	end
end

-- Add source-based derivation categories (top-level only).
--
-- Does nothing unless the keyword config carries `source_category_type`;
-- otherwise asks Module:etymology for the categories of that type between
-- the page language and the term's language and registers each of them.
local function collect_source_derivation_categories(categories, page_lang, term, config)
	local category_type = config.source_category_type
	if category_type then
		local collected = {}

		M.etymology.insert_source_cat_get_display({
			lang = page_lang,
			source = term.lang,
			categories = collected,
			borrowing_type = category_type,
			nocat = false,
		})

		for _, cat_name in ipairs(collected) do
			add_category(categories, cat_name)
		end
	end
end

-- Add source-language categories for a term.
--
-- Nothing is added when the term's normalised language has the same code as
-- the page language. Otherwise the default source categories are added, and
-- a second set is added while the chain is still flagged as inherited.
local function collect_source_categories(categories, page_lang, term, chain, get_norm_lang_func)
	if page_lang:getCode() == get_norm_lang_func(term.lang):getCode() then
		return
	end

	-- Fetch one batch of source categories from Module:etymology and register
	-- every entry. A nil borrowing_type leaves that field unset in the request.
	local function emit(borrowing_type)
		local collected = {}
		M.etymology.insert_source_cat_get_display({
			lang = page_lang,
			source = term.lang,
			categories = collected,
			borrowing_type = borrowing_type,
			nocat = false,
		})
		for _, cat_name in ipairs(collected) do
			add_category(categories, cat_name)
		end
	end

	emit(nil)

	if chain.inherited then
		-- NOTE(review): the inherited chain requests the "dipinjam" category
		-- type; confirm this is the label intended for inherited chains.
		emit("dipinjam")
	end
end

-- Add root/word categories for a term.
--
-- The part of speech is taken, in priority order, from the term's `postype`,
-- the keyword config's `pos_override`, or `term.args.pos`. Only "root" and
-- "word" lead to a category; unknown terms and self-references (same
-- normalised language and same title as the page's root entry) are skipped.
-- `chain.inside_affix` is not consulted by this function.
local function collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, chain,
	get_norm_lang_func, lang_exc, keyword)
	local keyword_config = get_keyword_config(keyword, lang_exc)

	local pos = term.postype
	if not pos then
		if keyword_config and keyword_config.pos_override then
			pos = keyword_config.pos_override
		elseif type(term.args) == "table" and term.args.pos then
			pos = term.args.pos
		end
	end

	-- Only these two values produce a category; the label equals the value
	local pos_type = (pos == "root" or pos == "word") and pos or nil
	if term.unknown_term or not pos_type then
		return
	end

	local page_code = get_norm_lang_func(page_lang):getFullCode()
	local term_code = get_norm_lang_func(term.lang):getFullCode()
	if page_code == term_code and root_title == term.title then
		-- The entry would only be categorising itself
		return
	end

	-- The category name uses the entry name produced by makeEntryName
	local entry_name = term.lang:makeEntryName(term.title)
	local lang_name = page_lang:getCanonicalName()

	local relation
	if chain.passed_through then
		relation = " diterbitkan daripada " .. export.get_cat_name(term.lang) .. " "
	else
		relation = " milik "
	end
	local cat_name = "Perkataan " .. lang_name .. relation .. pos_type .. " " .. entry_name

	-- Append the ID when the target has several etymons of the same pos.
	-- term.etymon_id is preferred over term.id when both are present.
	local target_ids = available_etymon_ids[term.target_key]
	local effective_id = term.etymon_id or term.id
	if target_ids and effective_id then
		local matching = 0
		for _, id_data in ipairs(target_ids) do
			if type(id_data) == "table" and id_data.pos == pos then
				matching = matching + 1
			end
		end
		if matching > 1 then
			cat_name = cat_name .. " (" .. effective_id .. ")"
		end
	end

	add_category(categories, cat_name)
end

-- Compute chain state for a term based on parent chain and keyword config
-- Hyphen characters used for affix detection, as whole UTF-8 strings:
-- regular hyphen, Hebrew maqqef, Arabic tatweel, Mongolian hyphen.
-- They are compared as complete strings rather than placed in a Lua character
-- class: string patterns work on bytes, so a class holding multi-byte
-- characters would match any title that merely shares one byte with them.
local AFFIX_HYPHENS = { "-", "־", "ـ", "᠊" }

-- Report whether `title` begins or ends with one of the affix hyphens.
local function has_edge_hyphen(title)
	for _, hyphen in ipairs(AFFIX_HYPHENS) do
		local len = #hyphen
		if title:sub(1, len) == hyphen or title:sub(-len) == hyphen then
			return true
		end
	end
	return false
end

-- Check if a term is an actual affix (not a non-affix member of an affix group).
--
-- Priority: an explicit `aftype` decides on its own (anything other than
-- "non-affix" after alias normalisation counts as an affix); otherwise
-- `args.pos == "root"` means non-affix; otherwise the title is inspected for
-- a leading or trailing hyphen. Returns a boolean.
local function is_actual_affix(term)
	-- Explicit aftype modifier
	if term.aftype then
		local normalized = aftype_aliases[term.aftype] or term.aftype
		return normalized ~= "non-affix"
	end

	-- pos=root is treated as non-affix
	if term.args and term.args.pos == "root" then
		return false
	end

	-- Auto-detect by hyphen: prefix ends with -, suffix starts with -, etc.
	if term.title then
		-- Strip the leading * of reconstructed terms before looking at the
		-- edges; the parentheses drop gsub's second return value.
		local title = (term.title:gsub("^%*", ""))
		return has_edge_hyphen(title)
	end

	-- No title to inspect: not an affix
	return false
end

-- Derive the chain state for a term from its parent's chain state and the
-- keyword config that introduced the term.
--
-- Returns a table with the fields passed_through, inherited, source, pos,
-- recurse and inside_affix, plus internal_lang on the regular path.
local function compute_category_chain(parent_chain, config, page_lang, term_lang, get_norm_lang_func, parent_term_lang, term)
	-- True once the chain has reached a term whose normalised language code
	-- differs from the page language's code.
	local function passed_through()
		return parent_chain.passed_through or page_lang:getCode() ~= get_norm_lang_func(term_lang):getCode()
	end

	-- Propagate the inside-affix flag; it is switched on only by a genuine
	-- affix (not by a non-affix member of an affix group).
	local inside_affix = parent_chain.inside_affix
	if config.affix_categories and term and is_actual_affix(term) then
		inside_affix = true
	end

	-- no_child_categories turns every downstream category off
	if config.no_child_categories then
		return {
			passed_through = passed_through(),
			inherited = false,
			source = false,
			pos = false,
			recurse = false,
			inside_affix = inside_affix,
		}
	end

	local source = parent_chain.source and is_transitive(config.transitive, page_lang, term_lang)
	local internal_lang = parent_chain.internal_lang

	if config.transitive == M.data.TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE then
		-- "Internal" means the same language as the running internal context
		-- if there is one, else the same as the parent term (or the page when
		-- no parent term language was given).
		local term_code = get_norm_lang_func(term_lang):getCode()
		local context_code = get_norm_lang_func(parent_term_lang or page_lang):getCode()

		if term_code == (internal_lang or context_code) then
			-- Internal derivation: stop source categories, remember the language
			source = false
			internal_lang = get_norm_lang_func(term_lang):getCode()
		else
			-- Cross-language step: leave the internal context
			internal_lang = nil
		end
	end

	local pos = parent_chain.pos

	return {
		passed_through = passed_through(),
		inherited = parent_chain.inherited and config.inherited_chain,
		source = source,
		pos = pos,
		internal_lang = internal_lang,
		recurse = source or pos,
		inside_affix = inside_affix,
	}
end

-- Walk an etymology tree and collect every category it implies.
--
-- opts fields read here:
--   data_tree              root node; `title` and `children` are used
--   page_lang              language object of the page
--   available_etymon_ids   map target_key -> list of etymon IDs
--   senseid_parent_etymon  map keyed by "<target_key>:<senseid>" (passed on
--                          to collect_affix_categories)
--   get_norm_lang_func     function applied to a language object before its
--                          code is compared
--   lang_exc               optional language-specific exceptions (keyword
--                          overrides)
--
-- Returns a list whose entries are either a plain category name or a table
-- { name = ..., sort_key = ..., sort_base = ... } when sort data was recorded.
-- The list is built with pairs(), so its order is unspecified.
function export.render(opts)
	opts = opts or {}
	local data_tree = opts.data_tree
	local page_lang = opts.page_lang
	local available_etymon_ids = opts.available_etymon_ids
	local senseid_parent_etymon = opts.senseid_parent_etymon
	local get_norm_lang_func = opts.get_norm_lang_func
	local lang_exc = opts.lang_exc

	-- categories: category name -> { sort_key, sort_base } (see add_category)
	-- seen: "<lang>:<title>:<id>" keys of nodes that were already walked
	local categories = {}
	local seen = {}
	local lang_name = page_lang:getCanonicalName()
	local root_title = data_tree.title

	-- Collect the tree recursively.
	-- node: current tree node; parent_chain: chain state inherited from the
	-- parent term; is_toplevel: true only for the root call.
	local function collect(node, parent_chain, is_toplevel)
		-- Avoid processing same node twice
		if not node.unknown_term and node.title then
			local key = node.lang:getFullCode() .. ":" .. (node.title or "") .. ":" .. (node.id or "")
			if seen[key] then return end
			seen[key] = true
		end

		-- Collect affix categories at top level only
		if is_toplevel then
			local affix_cats = collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc)
			for _, cat in ipairs(affix_cats) do
				-- Affix category names are prefixed with the page language name
				add_category(categories, lang_name .. " " .. cat.cat, cat.sort_key, cat.sort_base)
			end
		end

		-- Process each container
		for _, container in ipairs(node.children or {}) do
			local keyword = container.keyword
			local config = get_keyword_config(keyword, lang_exc)

			-- Skip invalid keywords
			if config then
				-- Process each term in the container
				for _, term in ipairs(container.terms or {}) do
					-- Chain state that applies to this term and its descendants
					local term_chain = compute_category_chain(parent_chain, config, page_lang, term.lang, get_norm_lang_func, node.lang, term)
					local no_child_categories = config.no_child_categories == true
					local term_is_transitive = is_transitive(config.transitive, page_lang, term.lang)

					-- Top-level only processing
					if is_toplevel then
						-- Missing/ambiguous etymon tracking
						if not term.unknown_term and (term.status == M.data.STATUS.MISSING or term.status == M.data.STATUS.REDLINK) then
							add_category(categories, lang_name .. " entries referencing missing etymons")
						end
						if not term.unknown_term and term.status == M.data.STATUS.AMBIGUOUS then
							add_category(categories, lang_name .. " entries referencing ambiguous etymons")
						end
						if term.missing_descendants_header then
							add_category(categories, lang_name .. " entries referencing etymons without Descendants sections")
						end
						if term.missing_descendants_entry then
							add_category(categories, lang_name .. " entries referencing etymons without this term in Descendants sections")
						end

						-- Top-level category (e.g., "undefined derivations")
						if config.toplevel_category then
							add_category(categories, lang_name .. " " .. config.toplevel_category)
						end

						-- Borrowing categories (bor, lbor, slbor, ubor, obor)
						if config.borrowing_type or config.specialized_borrowing then
							collect_borrowing_categories(categories, page_lang, term, config)
						end

						-- Borrowing categories from <bor>, <lbor>, or <slbor> modifiers on :af/:surf terms
						-- (only the first modifier present is honoured: bor, then lbor, then slbor)
						if keyword == "affix" or keyword == "surf" then
							if term.bor then
								local bor_config = { borrowing_type = "borrowed" }
								collect_borrowing_categories(categories, page_lang, term, bor_config)
							elseif term.lbor then
								local bor_config = { specialized_borrowing = "learned" }
								collect_borrowing_categories(categories, page_lang, term, bor_config)
							elseif term.slbor then
								local bor_config = { specialized_borrowing = "semi-learned" }
								collect_borrowing_categories(categories, page_lang, term, bor_config)
							end
						end

						-- Source-based derivation categories (sl, calque, pcal)
						if config.source_category_type then
							collect_source_derivation_categories(categories, page_lang, term, config)
						end

						-- Skip all child categorisation if no_child_categories is set
						if not no_child_categories then
							-- Source categories only if transitive
							if term_is_transitive then
								collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func)
							end

							-- Pos categories always (unless no_child_categories)
							collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain,
								get_norm_lang_func, lang_exc, keyword)
						end
					else
						-- Below top level, respect the parent chain
						if parent_chain.source then
							collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func)
						end

						if parent_chain.pos then
							collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain,
								get_norm_lang_func, lang_exc, keyword)
						end
					end

					-- Recurse into term's children if needed and status allows
					if term_chain.recurse and (term.status == M.data.STATUS.OK or term.status == M.data.STATUS.INLINE) then
						collect(term, term_chain, false)
					end
				end
			end
		end
	end

	-- Initial chain state: everything enabled, nothing passed through yet
	local initial_chain = {
		passed_through = false,
		inherited = true,
		source = true,
		pos = true,
		internal_lang = nil,
		recurse = true,
		inside_affix = false,
	}

	collect(data_tree, initial_chain, true)

	-- Flatten the map: plain names stay strings, names with sort data become
	-- tables so export.format can pass the sort data along.
	local cat_list = {}
	for cat_name, sort_data in pairs(categories) do
		if sort_data.sort_key ~= nil or sort_data.sort_base ~= nil then
			table.insert(cat_list, {
				name = cat_name,
				sort_key = sort_data.sort_key,
				sort_base = sort_data.sort_base,
			})
		else
			table.insert(cat_list, cat_name)
		end
	end
	return cat_list
end

-- Turn a category list (as returned by export.render) into wikitext.
--
-- entries: list of category names and/or { name, sort_key, sort_base } tables;
--          anything else in the list is ignored.
-- lang: language object handed to Module:utilities' format_categories.
-- Returns the concatenated output, or "" when `entries` is not a table or has
-- no array entries.
function export.format(entries, lang)
	if type(entries) ~= "table" or #entries == 0 then
		return ""
	end

	local rendered = {}
	for _, entry in ipairs(entries) do
		local entry_type = type(entry)
		if entry_type == "string" then
			table.insert(rendered, M.utilities.format_categories({ entry }, lang))
		elseif entry_type == "table" and type(entry.name) == "string" then
			-- Entries carrying sort data are formatted one at a time so each
			-- keeps its own sort key and sort base
			table.insert(rendered, M.utilities.format_categories({ entry.name }, lang, entry.sort_key, entry.sort_base))
		end
	end

	return table.concat(rendered)
end

return export