source ⟩ hypertext ⟩ public ⟩ basa ⟩ dictionary ⟩ annexe.js

/* Structure of a dictionary entry:
- headword [partofspeech.] definitions <- etymology -> notes */

import * as fs from "fs";
import * as path from "path";

/* Source languages that may be cited in an etymology, keyed by the code used
   inside a "[code form / form]" reference. For each language:
   - abbr: abbreviation displayed in the output
   - name: full name, used as the title of the <abbr> element
   - els:  HTML element wrapping each slash-separated written form, in order
   - tags: value of the lang attribute for each written form, in the same order
   els and tags are parallel arrays and must have the same length. */
const langs = {
	en: {
		abbr: "Eng.",
		name: "English",
		els: ["i"],
		tags: ["en"]
	},
	zh: {
		abbr: "Man.",
		name: "Mandarin Chinese",
		els: ["span", "i"],
		tags: ["zh-Hans", "zh-Latn"]
	},
	hi: {
		abbr: "H.–U.",
		name: "Hindi–Urdu",
		els: ["span", "span", "i"],
		tags: ["hi-Deva", "ur-Arab", "hi-Latn"]
	},
	es: {
		abbr: "Sp.",
		name: "Spanish (Castilian)",
		els: ["i"],
		tags: ["es"]
	},
	fr: {
		abbr: "Fr.",
		name: "French",
		els: ["i"],
		tags: ["fr"]
	},
	pt: {
		abbr: "Pt.",
		name: "Portuguese",
		els: ["i"],
		tags: ["pt"]
	},
	ar: {
		abbr: "Ar.",
		name: "Arabic",
		els: ["span", "i"],
		tags: ["ar-Arab", "ar-Latn"]
	},
	ms: {
		abbr: "M.–I.",
		name: "Malay–Indonesian",
		els: ["i"],
		tags: ["ms"]
	},
	ru: {
		abbr: "Rus.",
		name: "Russian",
		els: ["span", "i"],
		tags: ["ru-Cyrl", "ru-Latn"]
	},
	sw: {
		abbr: "Sw.",
		name: "Swahili",
		els: ["i"],
		tags: ["sw"]
	},
	la: {
		abbr: "Lat.",
		name: "Latin",
		els: ["i"],
		tags: ["la"]
	},
	grc: {
		abbr: "A.Gk.",
		name: "Ancient Greek",
		els: ["span", "i"],
		tags: ["grc-Grek", "grc-Latn"]
	},
	fa: {
		abbr: "Per.",
		name: "Persian",
		els: ["span", "i"],
		tags: ["fa-Arab", "fa-Latn"]
	}
};

/**
 * Renders one bracketed language reference from an etymology as HTML.
 *
 * Accepts either "[code]" or "[code form / form / ...]", where `code` is a
 * key of `langs`. The result is the language's <abbr> element, followed
 * (when forms are present) by each form wrapped in the element and lang
 * attribute that `langs` assigns to that position.
 *
 * Usable directly as a String.prototype.replace callback: only the first
 * argument (the matched text) is read.
 *
 * @param {string} brackets - The bracketed reference, brackets included.
 * @returns {string} HTML markup for the reference.
 * @throws {TypeError} If no language code is found or it is not in `langs`.
 */
const markLanguage = brackets => {
	const code = brackets.match(/[a-z]+/)[0];
	const { abbr, name, els, tags } = langs[code];
	const label = `<abbr title="${name}">${abbr}</abbr>`;
	const words = brackets.match(/\[[a-z]+ (.*?)\]/);

	// A bare "[code]" names the language without quoting any word.
	if (!words) {
		return label;
	}

	const forms = words[1].split(" / ").map(
		// Close with the element's own name: HTML parsers ignore a bare
		// "</>" end tag, which would leave every element unclosed.
		(word, idx) => `<${els[idx]} lang="${tags[idx]}">${word}</${els[idx]}>`
	);

	return `${label} ${forms.join(" ")}`;
};

/* Swaps every space in `text` for a hyphen, e.g. to derive an anchor name
   from a multi-word headword. */
const hyphenate = text => text.split(" ").join("-");

/* Part-of-speech tags accepted in a definition (written "[tag]" in the
   source text), mapped to the full name shown as the <abbr> title. */
const partsOfSpeech = {
	"n.": "noun",
	"v.": "verb",
	"a.": "adjective",
	"adv.": "adverb",
	"p.n.": "proper noun",
	"prn.": "pronoun",
	"det.": "determiner",
	"prep.": "preposition",
	"part.": "particle",
	"aff.": "affix",
	"num.": "numeral"
};

/* Replaces EVERY occurrence of a literal token. String.prototype.replace
   with a string pattern stops after the first occurrence. */
const replaceEvery = (text, token, replacement) =>
	text.split(token).join(replacement);

/* Turns each "[# word]" cross-reference into a link to that headword's anchor. */
const linkHeadwords = text =>
	text.replace(
		/\[# (.*?)\]/g,
		(whole, word) =>
			`<a href="#${hyphenate(
				word
			)}" lang="art-x-basa"><strong>${word}</strong></a>`
	);

/**
 * Parses one dictionary line of the form
 *   - headword [partofspeech.] definitions <- etymology -> notes
 * (etymology and notes are optional) and converts its shorthand to HTML.
 *
 * @param {string} entry - A single line of the dictionary source.
 * @returns {{url: string, headword: string, definition: string,
 *            etymon: (string|undefined), note: (string|undefined)}}
 *   `url` is the hyphenated headword; `etymon` and `note` are undefined when
 *   the corresponding section is absent or empty.
 * @throws {TypeError} If `entry` does not follow the entry structure.
 */
const mark = entry => {
	const parts = entry.match(
		/^- (.*?) (\[.*?)(?: <- (.*?))?(?: -> (.*?))?$/
	);

	if (!parts) {
		// Name the offending line instead of failing on a null dereference.
		throw new TypeError(
			`Malformed dictionary entry: ${JSON.stringify(entry)}`
		);
	}

	const [, headword, rawDefinition, rawEtymon, rawNote] = parts;

	const withPartsOfSpeech = Object.entries(partsOfSpeech).reduce(
		(text, [tag, title]) =>
			replaceEvery(text, `[${tag}]`, `<abbr title='${title}'>${tag}</abbr>`),
		rawDefinition
	);

	// "[] " marks an entry that has no part of speech; drop the placeholder.
	const definition = replaceEvery(withPartsOfSpeech, "[] ", "");

	let etymon;
	if (rawEtymon) {
		const withLanguages = rawEtymon.replace(
			/\[(?:en|zh|hi|es|fr|pt|ar|ru|sw|ms|la|grc|fa)(?: .*?)?\]/g,
			markLanguage
		);
		const withDittos = replaceEvery(
			replaceEvery(withLanguages, "[ditto]", "”"),
			"[+]",
			"” +"
		);
		etymon = linkHeadwords(withDittos);
	}

	let note;
	if (rawNote) {
		note = linkHeadwords(replaceEvery(rawNote, "[see]", "▶"));
	}

	return {
		url: hyphenate(headword),
		headword,
		definition,
		etymon,
		note
	};
};

/**
 * Builds the node for one entry together with its nested sub-entries.
 *
 * `word` is an entry line optionally followed by tab-indented lines. A
 * single-line entry yields `{ headword }`; otherwise the first line becomes
 * the headword, one level of tab indentation is removed from every line, and
 * each resulting line that starts with "-" begins a child handled recursively.
 *
 * @param {string} word - Entry text, possibly spanning several lines.
 * @returns {{headword: object, children?: object[]}} The node for this entry.
 */
const branch = word => {
	// No newline: a leaf entry with no sub-entries.
	if (!word.includes("\n")) {
		return { headword: mark(word) };
	}

	const [firstLine] = word.match(/^.*?$/m);

	// After removing one tab per line, direct children start with "-" while
	// deeper descendants keep a leading tab and stay attached to their parent.
	const dedented = word.replace(/^\t/gm, "");
	const [, ...childEntries] = dedented.split(/\n(?=-)/);

	return {
		headword: mark(firstLine),
		children: childEntries.map(branch)
	};
};

/* Splits one section of the dictionary into its top-level entries (a new
   entry begins at each line starting with "-") and builds a node for each. */
const tree = trunk => {
	const entries = trunk.split(/\n(?=-)/);
	return entries.map(branch);
};

/**
 * Splits the whole dictionary text into blank-line-separated sections and
 * builds the entry tree of each.
 *
 * Runs of several blank lines count as one separator, and blank sections
 * (for example from a trailing blank line at the end of the file) are
 * skipped rather than being parsed as entries.
 *
 * @param {string} dict - Dictionary text with "\n" line endings.
 * @returns {Array} One array of entry nodes per section.
 */
const dictionary = dict =>
	dict
		.split(/\n{2,}/)
		.filter(section => section.trim() !== "")
		.map(tree);

/* Dictionary source, read as UTF-8 text. Every line-ending convention
   (CRLF, LFCR, lone CR) is normalised to a bare LF. */
const enToBasa = fs
	.readFileSync(
		"/home/atossa/server/satyrsforest/hypertext/public/basa/dictionary/dict.txt",
		"utf8"
	)
	.replace(/\r\n|\n\r|\r|\n/g, "\n");

export default output;