Incredible majestic beautiful md parser

2024-02-28 01:08:34 -07:00
parent e1ed37d733
commit 6ef8c20149
12 changed files with 674 additions and 128 deletions
--- a/lib/tcmd/index.ts
+++ b/lib/tcmd/index.ts
@@ -0,0 +1,226 @@
+import { zipArrays } from "../zip";
+import { inlineTokens } from "./inlineTokens";
+import { singleLineTokens } from "./singleLineTokens";
+import { tokenizeBlock } from "./tokenizeBlock";
+import { tokenizeParagraph } from "./tokenizeParagraph";
+
+/**
+ * Public entry point of the parser.
+ *
+ * @param body Raw markdown source text.
+ * @returns Whatever `tokenize` produces for `body`.
+ */
+export const createElements = (body: string) => tokenize(body);
+
+/**
+ * Splits a markdown body into paragraphs (on blank lines) and folds them into
+ * a tree of block tokens.
+ *
+ * For each paragraph, in order:
+ * 1. `tokenizeBlock` is consulted. A string result is treated as a block
+ *    closing marker and closes the most recently added block that is still
+ *    open; an object result is a new block, which is nested under the open
+ *    block when there is one.
+ * 2. Otherwise the paragraph is content. An implicit `"block"` container is
+ *    opened when no block is open, then `tokenizeParagraph` is consulted for
+ *    multi-line paragraph tokens.
+ * 3. Whatever remains is split into lines and tokenized with `tokenizeLine`.
+ *
+ * @param body Raw markdown source text.
+ * @returns The block tokens that have no `parent` set.
+ */
+const tokenize = (body: string) => {
+  const paragraphs = body.split("\n\n");
+
+  const blockTokens: BlockToken[] = [];
+  const paragraphTokens: ParagraphToken[] = [];
+
+  for (const paragraph of paragraphs) {
+    const block = tokenizeBlock(paragraph);
+    // Most recently added block that has not been closed yet, if any.
+    let openBT = blockTokens.findLast((bt) => !bt.closed);
+    if (block) {
+      // A string result is a closing marker: close the open block and move on.
+      if (typeof block === "string") {
+        if (openBT) {
+          openBT.closed = true;
+        }
+        continue;
+      }
+
+      // A new block becomes a child of the block that is currently open.
+      if (openBT) {
+        openBT.children.push(block);
+        block.parent = openBT.type;
+      }
+      blockTokens.push(block);
+      continue;
+    }
+
+    // Content outside of any explicit block gets an implicit container.
+    if (!openBT) {
+      openBT = {
+        children: [],
+        closed: false,
+        metadata: {},
+        type: "block",
+      };
+      blockTokens.push(openBT);
+    }
+
+    const multiline = tokenizeParagraph(paragraph);
+    // Most recently added paragraph token that has not been closed yet, if any.
+    let openP = paragraphTokens.findLast((p) => !p.closed);
+    if (multiline) {
+      // An array result is the content of a closing paragraph: append it to
+      // the open paragraph token and close that token.
+      if (Array.isArray(multiline)) {
+        if (openP) {
+          openP.closed = true;
+          openP.content = openP.content.concat(multiline);
+        }
+        continue;
+      }
+
+      openBT.children.push(multiline);
+      paragraphTokens.push(multiline);
+      continue;
+    } else if (openP && !openP?.allowsInline) {
+      // NOTE(review): the paragraph is pushed here as raw text and the line
+      // loop below pushes its lines into the same token again - confirm that
+      // this duplication is intended.
+      openP.content.push({
+        line: paragraph,
+        raw: paragraph,
+        type: "text",
+      });
+    }
+
+    // I don't think the closed check is necessary, but just in case
+    // if (openP && !openP.closed && !openP.allowsInline) continue;
+    if (!openP) {
+      // Plain paragraph: created already closed, so it never absorbs the
+      // paragraphs that follow it.
+      openP = {
+        allowsInline: true,
+        closed: true,
+        content: [],
+        metadata: {},
+        type: "p",
+      };
+      openBT.children.push(openP);
+      paragraphTokens.push(openP);
+    }
+
+    const lines = paragraph.split("\n");
+    let previous: SingleLineToken | undefined;
+    for (const line of lines) {
+      const singleLine = tokenizeLine(line, previous);
+
+      if (singleLine) {
+        // `tokenizeLine` hands back `previous` itself when it merged the line
+        // into it; that token is already in `content`.
+        if (singleLine !== previous) {
+          openP.content.push(singleLine);
+        }
+        previous = singleLine;
+      }
+    }
+  }
+
+  return blockTokens.filter((b) => !b.parent);
+};
+
+// const __tokenize = (md: string) => {
+//   const tokens: (Token)[] = [];
+//   // md = md.replace(/(?<=[a-z])\n(?=[a-z])/g, " ");
+//   const lines = md.split("\n");
+//   let preserveEmpty = false;
+//   let multilineLines;
+//   let tokenSettings;
+
+//   for (let line of lines) {
+//     if (!line && !preserveEmpty) continue;
+//     let foundLine = false;
+
+//     if (!multilineLines) {
+//       token:
+//       for (const token of multilineTokens) {
+//         if (!token.rx.test(line)) continue token;
+//         tokenSettings = token;
+//         multilineLines = token.create(tokens);
+//         preserveEmpty = true;
+//         foundLine = true;
+//         multilineLines.push({
+//           type: "text",
+//           line: token.replace(line),
+//         });
+//       }
+//     } else {
+//       foundLine = true;
+//       if (tokenSettings?.closeRx?.test(line) || tokenSettings?.rx.test(line)) {
+//         tokenSettings = undefined;
+//         multilineLines = undefined;
+//         preserveEmpty = false;
+//       } else {
+//         multilineLines.push({
+//           type: "text",
+//           line,
+//         });
+//       }
+//     }
+
+//     if (!multilineLines) {
+//       token:
+//       for (const token of singleLineTokens) {
+//         if (!token.rx.test(line)) continue token;
+//         foundLine = true;
+//         line = line.replace(token.replaceRx, "").trim();
+
+//         const lineContent = tokenizeInline(line);
+//         token.create(lineContent, tokens);
+//       }
+//     }
+
+//     if (foundLine) continue;
+
+//     tokens.push({
+//       type: "text",
+//       line: tokenizeInline(line),
+//     });
+//   }
+
+//   return tokens;
+// };
+
+/**
+ * Tokenizes one line of a paragraph.
+ *
+ * The first entry of `singleLineTokens` whose `rx` matches builds the token;
+ * the text left after stripping `replaceRx` is inline-tokenized into `line`.
+ * When nothing matches and `previous` has `mends` set, the line is treated as
+ * a continuation: it is appended to `previous.raw`, `previous.line` is
+ * recomputed, and `previous` itself (mutated in place) is returned.
+ * Otherwise a plain `"text"` token is returned.
+ *
+ * @param line Single line of source text.
+ * @param previous Token produced for the preceding line, if any.
+ * @returns A new token, or `previous` when the line was merged into it.
+ */
+const tokenizeLine = (
+  line: string,
+  previous?: SingleLineToken,
+): SingleLineToken => {
+  for (const token of singleLineTokens) {
+    if (!token.rx.test(line)) continue;
+
+    const t = token.create(line);
+    t.line = tokenizeInline(line.replace(token.replaceRx, ""));
+    return t;
+  }
+
+  if (previous?.mends) {
+    previous.raw += " " + line;
+    // Strip the leading marker only when the token kept a reference to its
+    // config; without one the merged raw text is tokenized as it is instead
+    // of throwing.
+    const marker = previous.cfg?.rx;
+    previous.line = tokenizeInline(
+      marker ? previous.raw.replace(marker, "") : previous.raw,
+    );
+    return previous;
+  }
+
+  return {
+    line: tokenizeInline(line),
+    type: "text",
+    raw: line,
+  };
+};
+
+/**
+ * Finds inline tokens (as described by `inlineTokens`) in a line.
+ *
+ * @param line Text to scan; it is trimmed first.
+ * @returns The trimmed line unchanged when no inline token matches. Otherwise
+ *   the result of zipping the plain-text segments with the matched tokens,
+ *   with empty-content entries removed.
+ */
+const tokenizeInline = (line: string) => {
+  line = line.trim();
+  const tokens: InlineTokenInsert[] = [];
+
+  for (const token of inlineTokens) {
+    // Scan with a private, always-global copy of the pattern. The shared
+    // regex's `lastIndex` can be reset by `create` (for example through
+    // `String.prototype.replace`), which would restart an `exec` loop on the
+    // shared object forever; a non-global regex would never advance at all.
+    const flags = token.rx.flags.includes("g")
+      ? token.rx.flags
+      : token.rx.flags + "g";
+    const scanner = new RegExp(token.rx.source, flags);
+
+    let match: RegExpExecArray | null;
+    while ((match = scanner.exec(line)) !== null) {
+      // An empty match does not move `lastIndex`; step over it manually.
+      if (match[0].length === 0) {
+        scanner.lastIndex++;
+        continue;
+      }
+
+      const tokenStart = match.index;
+      const tokenEnd = match.index + match[0].length;
+
+      token.create(match, tokenStart, tokenEnd, tokens);
+    }
+  }
+
+  if (tokens.length) {
+    // Inserts were collected per token type; put them in reading order so
+    // they line up with the text segments cut below.
+    tokens.sort((a, b) => a.start - b.start);
+
+    // Cut the plain text around the inserts: one segment before each insert
+    // and one after the last, so there is always one more segment than there
+    // are inserts (segments may be empty).
+    const segments: InlineToken[] = [];
+    let cursor = 0;
+    for (const insert of tokens) {
+      segments.push({
+        content: line.slice(cursor, insert.start),
+        type: "text",
+      });
+      // Never move backwards when inserts overlap.
+      cursor = Math.max(cursor, insert.end);
+    }
+    segments.push({ content: line.slice(cursor), type: "text" });
+
+    // NOTE(review): assumes `zipArrays` interleaves its arguments
+    // (a[0], b[0], a[1], b[1], ...) - confirm against "../zip".
+    return zipArrays(segments, tokens).filter((t) => t.content);
+  }
+  return line;
+};
--- a/lib/tcmd/inlineTokens.ts
+++ b/lib/tcmd/inlineTokens.ts
@@ -0,0 +1,45 @@
+// Separator placed between a link's label and its href by the anchor `replace`.
+const joiner = "<><>";
+
+/**
+ * Inline token definitions.
+ *
+ * - `rx` finds every occurrence of the token in a line (global flag).
+ * - `create` receives one match plus its start/end offsets and pushes the
+ *   resulting insert onto `tokens`.
+ * - `replace` rewrites every occurrence in a line to its plain-text form.
+ *
+ * Neither `create` nor `replace` touches `rx.lastIndex`, so both are safe to
+ * call while a caller is iterating matches of `rx` with `exec`.
+ */
+export const inlineTokens: {
+  rx: RegExp;
+  create: (
+    content: RegExpExecArray,
+    start: number,
+    end: number,
+    tokens: InlineTokenInsert[],
+  ) => void;
+  replace: (line: string) => string;
+}[] = [
+  {
+    // Bold: `**text**`.
+    rx: /(\*\*)(.*?)(\*\*)/g,
+    create(content, start, end, tokens) {
+      tokens.push({
+        // Capture group 2 is the text between the asterisks. Reading it from
+        // the match avoids re-running the shared global regex, which would
+        // reset its `lastIndex` in the middle of the caller's `exec` loop.
+        content: content[2],
+        type: "bold",
+        end,
+        start,
+      });
+    },
+    replace(l) {
+      // Use a copy of the pattern so the shared regex state is left alone.
+      return l.replace(new RegExp(this.rx), (_, __, val) => val);
+    },
+  },
+  {
+    // Link: `[label](href)`.
+    rx: /\[(.*?)\]\((.*?)\)/g,
+    create(content, start, end, tokens) {
+      const [, label, href] = content;
+      tokens.push({
+        content: label,
+        type: "anchor",
+        data: {
+          href,
+        },
+        start,
+        end,
+      });
+    },
+    replace(l) {
+      // Use a copy of the pattern so the shared regex state is left alone.
+      return l.replace(
+        new RegExp(this.rx),
+        (_, label, href) => [label, href].join(joiner),
+      );
+    },
+  },
+];
--- a/lib/tcmd/singleLineTokens.ts
+++ b/lib/tcmd/singleLineTokens.ts
@@ -0,0 +1,39 @@
+// Line-prefix patterns. None of them is global or sticky, so a single RegExp
+// object can serve both as the matcher (`rx`) and as the prefix stripper
+// (`replaceRx`).
+const h1Prefix = /^#\s/;
+const h2Prefix = /^##\s/;
+const h3Prefix = /^###\s/;
+const list1Prefix = /^-\s/;
+const list2Prefix = /^[\t\s]{2}-\s/;
+
+/**
+ * Single-line token definitions: three heading levels and two list levels.
+ *
+ * Each entry's `create` builds the token for a matching line, storing the
+ * line as both `line` and `raw` and keeping a reference to the entry itself
+ * in `cfg`. List tokens are additionally created with `mends: true`.
+ */
+export const singleLineTokens: SingleLineCfg[] = [
+  {
+    rx: h1Prefix,
+    create(line) {
+      return { type: "h1", line, raw: line, cfg: this };
+    },
+    replaceRx: h1Prefix,
+  },
+  {
+    rx: h2Prefix,
+    create(line) {
+      return { type: "h2", line, raw: line, cfg: this };
+    },
+    replaceRx: h2Prefix,
+  },
+  {
+    rx: h3Prefix,
+    create(line) {
+      return { type: "h3", line, raw: line, cfg: this };
+    },
+    replaceRx: h3Prefix,
+  },
+  {
+    rx: list1Prefix,
+    create(line) {
+      return { type: "list1", line, raw: line, mends: true, cfg: this };
+    },
+    replaceRx: list1Prefix,
+    shouldMendNextLine: true,
+  },
+  {
+    rx: list2Prefix,
+    create(line) {
+      return { type: "list2", line, raw: line, mends: true, cfg: this };
+    },
+    replaceRx: list2Prefix,
+    shouldMendNextLine: true,
+  },
+];
--- a/lib/tcmd/tokenizeBlock.ts
+++ b/lib/tcmd/tokenizeBlock.ts
@@ -0,0 +1,44 @@
+/**
+ * Checks whether a paragraph opens or closes one of the known block types.
+ *
+ * Block definitions are tried in order. For the first definition that
+ * matches:
+ * - if its closing pattern matches, the block's `type` string is returned;
+ * - otherwise, if its opening pattern matches, a new block token is returned.
+ *
+ * @param paragraph Paragraph text to inspect.
+ * @returns A block type string (closing marker), a new block token (opening
+ *   marker), or `undefined` when no definition matches.
+ */
+export const tokenizeBlock = (paragraph: string) => {
+  for (const block of blockTokens) {
+    // `test` on a global or sticky regex resumes from `lastIndex`, which a
+    // previous call may have left past the start of the string. Rewind both
+    // patterns so every paragraph is examined from its beginning.
+    block.rx.lastIndex = 0;
+    block.closeRx.lastIndex = 0;
+
+    const openTest = block.rx.test(paragraph),
+      closeTest = block.closeRx.test(paragraph);
+
+    // The closing check wins when a paragraph matches both patterns.
+    if (closeTest) return block.create(paragraph).type;
+    if (!openTest) continue;
+    return block.create(paragraph);
+  }
+};
+
+/**
+ * Block definitions: `rx` recognizes the opening marker, `closeRx` the
+ * closing marker, and `create` builds a fresh, open block token.
+ *
+ * The patterns are only ever used with `test`, so none of them carries the
+ * `g` flag: a global regex keeps `lastIndex` between calls and would make the
+ * result for one paragraph depend on the paragraph tested before it.
+ */
+const blockTokens: {
+  rx: RegExp;
+  closeRx: RegExp;
+  create: (line: string) => BlockToken;
+}[] = [
+  // Grid block: the paragraphs inside it are laid out in columns, one column
+  // per `[]` pair found on the opening line.
+  {
+    rx: /^(\[\]){2,}/,
+    closeRx: /\/\[\]/,
+    create(line) {
+      return {
+        type: "grid",
+        metadata: {
+          columns: line.match(/\[\]/g)?.length,
+        },
+        children: [],
+        closed: false,
+      };
+    },
+  },
+  // Card block: opened by a leading `[[`, closed by `]]`.
+  {
+    rx: /^(\[\[)/,
+    closeRx: /\]\]/,
+    create() {
+      return {
+        type: "card",
+        metadata: {},
+        children: [],
+        closed: false,
+      };
+    },
+  },
+];
--- a/lib/tcmd/tokenizeParagraph.ts
+++ b/lib/tcmd/tokenizeParagraph.ts
@@ -0,0 +1,42 @@
+/**
+ * Checks whether a paragraph opens and/or closes a multi-line paragraph token.
+ *
+ * Definitions are tried in order. For the first one that matches:
+ * - opening and closing patterns both match: a token marked `closed` is
+ *   returned;
+ * - only the closing pattern matches: the `content` array of a freshly
+ *   created token is returned;
+ * - only the opening pattern matches: a new, still open token is returned.
+ *
+ * @param paragraph Paragraph text to inspect.
+ * @returns A paragraph token, a content array, or `undefined` when no
+ *   definition matches.
+ */
+export const tokenizeParagraph = (paragraph: string) => {
+  for (const block of blockTokens) {
+    // `test` on a global or sticky regex resumes from `lastIndex`, which a
+    // previous call may have left past the start of the string. Rewind both
+    // patterns so every paragraph is examined from its beginning.
+    block.rx.lastIndex = 0;
+    block.closeRx.lastIndex = 0;
+
+    const openTest = block.rx.test(paragraph),
+      closeTest = block.closeRx.test(paragraph);
+    if (openTest && closeTest) {
+      const p = block.create(paragraph);
+      p.closed = true;
+      return p;
+    }
+    if (closeTest) return block.create(paragraph).content;
+
+    if (openTest) {
+      return block.create(paragraph);
+    }
+  }
+};
+
+/**
+ * Multi-line paragraph definitions: `rx` recognizes the opening marker,
+ * `closeRx` the closing marker, and `create` builds the paragraph token.
+ *
+ * The patterns are used with `test`, so they do not carry the `g` flag: a
+ * global regex keeps `lastIndex` between calls and would make the result for
+ * one paragraph depend on the paragraph tested before it.
+ */
+const blockTokens: {
+  rx: RegExp;
+  closeRx: RegExp;
+  create: (line: string) => ParagraphToken;
+}[] = [
+  // Fenced code: opened by a leading "```", closed by "```" at the start of a
+  // later line.
+  {
+    rx: /^```/,
+    closeRx: /\n```/,
+    create(line) {
+      return {
+        type: "code",
+        metadata: {
+          // Whatever follows the opening fence on the first line.
+          language: line.split("\n").at(0)!.replace(this.rx, ""),
+        },
+        closed: false,
+        content: [{
+          // The paragraph with its fence lines removed.
+          line: line.replace(/```.*?\n/g, "").replace(/\n```/, ""),
+          type: "text",
+          raw: line,
+        }],
+        allowsInline: false,
+      };
+    },
+  },
+];