/**
 * HTML to Lexical JSON Converter
 * Story 1.3: Content Migration Script
 *
 * Converts HTML content to Payload CMS Lexical editor format
 */

import { parse } from 'html-parse-stringify'

// ============================================================
// LEXICAL JSON TYPES
// ============================================================

interface LexicalNode {
  type: string
  version: number
  [key: string]: any
}

interface LexicalTextContent {
  type: 'text'
  version: 1
  // Serialized Lexical text nodes store `detail` as a numeric bit field.
  detail?: number
  format?: number
  mode?: string
  style?: string
  text: string
}

interface LexicalElementNode {
  type: 'element' | 'heading' | 'link' | 'list' | 'listitem' | 'quote' | 'paragraph'
  version: 1
  children: LexicalContent[]
  direction?: 'ltr' | 'rtl' | null
  format?: '' | 'left' | 'start' | 'center' | 'right' | 'end' | 'justify'
  indent?: number
  tag?: string
  listType?: 'bullet' | 'number'
  rel?: null | string
  target?: null | string
  title?: null | string
  url?: string
}

interface LexicalLinebreakNode {
  type: 'linebreak'
  version: 1
}

interface LexicalRoot {
  type: 'root'
  version: 1
  children: LexicalElementNode[]
  direction: 'ltr' | 'rtl' | null
}

type LexicalContent = LexicalTextContent | LexicalElementNode | LexicalLinebreakNode

// ============================================================
// HTML AST TYPES
// ============================================================

/**
 * Minimal shape of the nodes this module reads from the parser output.
 *
 * html-parse-stringify documents `content` (text nodes) and `attrs`
 * (tag nodes); `value` and `attributes` are accepted as well because the
 * original converter read those names too.
 */
interface HtmlAstNode {
  type?: string
  name?: string
  content?: string
  value?: string
  attrs?: Record<string, string | undefined>
  attributes?: Record<string, string | undefined>
  children?: HtmlAstNode[]
}

/** Lexical text-format bit flags (bit 2, value 4, is strikethrough). */
const TEXT_FORMAT = {
  bold: 1,
  italic: 1 << 1,
  underline: 1 << 3,
} as const

const HEADING_TAGS: ReadonlySet<string> = new Set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

// ============================================================
// HTML TO LEXICAL CONVERTER
// ============================================================

/**
 * Convert an HTML string to a Lexical JSON string.
 *
 * IMPORTANT: Payload's richText field expects content wrapped in
 * { "root": {...} } structure, so the serialized value has that shape.
 *
 * @param html - Raw HTML; empty or non-string input yields an empty document.
 * @returns JSON-serialized `{ root: LexicalRoot }`.
 */
export function htmlToLexical(html: string): string {
  return JSON.stringify(htmlToLexicalObject(html))
}

/**
 * Convert an HTML string to a Lexical object (for direct use with the
 * Payload local API).
 *
 * If conversion throws, a warning is logged and the cleaned HTML is stored
 * as a single plain-text paragraph instead.
 *
 * @param html - Raw HTML; empty or non-string input yields an empty document.
 * @returns `{ root: LexicalRoot }` for a Payload richText field.
 */
export function htmlToLexicalObject(html: string): { root: LexicalRoot } {
  if (!html || typeof html !== 'string') {
    return buildDocument([])
  }

  // Clean the HTML first
  const cleanedHtml = cleanHtml(html)

  try {
    const ast = parse(cleanedHtml) as HtmlAstNode[]
    // Same clean-up for both public entry points so their output agrees.
    return buildDocument(cleanEmptyTextNodes(convertNodes(ast)))
  } catch (error: unknown) {
    console.warn('Failed to parse HTML, using fallback:', error)
    return buildDocument([createParagraph([createTextNode(cleanedHtml)])])
  }
}

/**
 * Wrap block nodes in the `{ root: ... }` envelope. An empty list is
 * replaced by a single empty paragraph so the root always has a child.
 */
function buildDocument(children: LexicalElementNode[]): { root: LexicalRoot } {
  return {
    root: {
      type: 'root',
      version: 1,
      children: children.length > 0 ? children : [createEmptyParagraph()],
      direction: null,
    },
  }
}

/**
 * Create a paragraph node holding the given inline children
 */
function createParagraph(children: LexicalContent[]): LexicalElementNode {
  return { type: 'paragraph', version: 1, children }
}

/**
 * Create an empty paragraph node
 */
function createEmptyParagraph(): LexicalElementNode {
  return createParagraph([createTextNode('')])
}

/**
 * Remove empty text nodes at any depth below the given children.
 */
function stripEmptyText(children: LexicalContent[]): LexicalContent[] {
  return children
    .filter((child) => !(child.type === 'text' && child.text === ''))
    .map((child) =>
      'children' in child ? { ...child, children: stripEmptyText(child.children) } : child,
    )
}

/**
 * Clean empty text nodes from Lexical tree
 * Payload's Lexical validator rejects empty text nodes
 *
 * NOTE(review): a top-level node left with no children still receives one
 * empty text node as a placeholder (behaviour kept from the original
 * implementation). That appears to contradict the rule above; confirm
 * against Payload whether `children: []` should be emitted instead.
 */
function cleanEmptyTextNodes(nodes: LexicalElementNode[]): LexicalElementNode[] {
  return nodes.map((node) => {
    const children = stripEmptyText(node.children)
    return { ...node, children: children.length > 0 ? children : [createTextNode('')] }
  })
}

/**
 * Clean HTML by removing unwanted elements.
 *
 * This is a migration clean-up, not a security sanitizer.
 */
function cleanHtml(html: string): string {
  return (
    html
      // Remove script and style tags
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
      // Remove Webflow-specific attributes
      .replace(/\sdata-[a-z-]+="[^"]*"/gi, '')
      .replace(/\sclass="[^"]*"/gi, '')
      // Clean up empty tags
      .replace(/<p>\s*<\/p>/gi, '')
      .replace(/<div>\s*<\/div>/gi, '')
      .trim()
  )
}

/**
 * Read the text of a text node. `||` is deliberate: an empty `value`
 * falls through to `content`.
 */
function readText(node: HtmlAstNode): string {
  return node.value || node.content || ''
}

/**
 * Read an attribute from either `attrs` or `attributes`; '' when absent.
 */
function readAttr(node: HtmlAstNode, name: string): string {
  return node.attrs?.[name] ?? node.attributes?.[name] ?? ''
}

/**
 * True when an inline node would render something (anything other than an
 * empty text node).
 */
function hasVisibleContent(node: LexicalContent): boolean {
  return node.type !== 'text' || node.text !== ''
}

/**
 * Convert HTML AST nodes to Lexical block nodes.
 *
 * Output order follows document order. Returns a single empty paragraph
 * when nothing was produced.
 */
function convertNodes(nodes: HtmlAstNode[]): LexicalElementNode[] {
  const result: LexicalElementNode[] = []

  for (const node of nodes) {
    // Handle text nodes
    if (node.type === 'text') {
      const text = readText(node).trim()
      if (text) {
        result.push(createParagraph([createTextNode(text)]))
      }
      continue
    }

    if (!node.name) continue

    const tag = node.name.toLowerCase()
    const children = node.children ?? []

    // Handle headings
    if (HEADING_TAGS.has(tag)) {
      result.push(createHeading(tag, children))
      continue
    }

    switch (tag) {
      // Paragraphs, and divs (treated as paragraphs)
      case 'p':
      case 'div':
        result.push(createParagraph(convertInlineNodes(children)))
        break

      // Lists are emitted in place so they keep their position relative to
      // surrounding content; flushList skips lists with no <li> items.
      case 'ul':
      case 'ol': {
        const list: LexicalElementNode = {
          type: 'list',
          version: 1,
          listType: tag === 'ol' ? 'number' : 'bullet',
          children: [],
        }
        flushList(result, list, convertListItems(children))
        break
      }

      case 'blockquote':
        result.push({ type: 'quote', version: 1, children: convertInlineNodes(children) })
        break

      // Line breaks and horizontal rules
      case 'br':
        result.push(createParagraph([{ type: 'linebreak', version: 1 }]))
        break
      case 'hr':
        result.push(createParagraph([createTextNode('---')]))
        break

      case 'img':
        result.push(createImageNode(readAttr(node, 'src'), readAttr(node, 'alt')))
        break

      // Any other tag: keep its content as one paragraph rather than
      // dropping it. Passing the node itself lets inline tags such as
      // <strong> or <a> keep their handling.
      default: {
        const inline = convertInlineNodes([node])
        if (inline.some(hasVisibleContent)) {
          result.push(createParagraph(inline))
        }
      }
    }
  }

  return result.length > 0 ? result : [createEmptyParagraph()]
}

/**
 * Attach items to the list and append it to result; does nothing when
 * there is no list or no items.
 */
function flushList(
  result: LexicalElementNode[],
  list: LexicalElementNode | null,
  items: LexicalElementNode[],
): void {
  if (list && items.length > 0) {
    list.children = items
    result.push(list)
  }
}

/**
 * Convert list items (li) to Lexical format; non-li children are ignored
 */
function convertListItems(items: HtmlAstNode[]): LexicalElementNode[] {
  return items
    .filter((item) => item.name?.toLowerCase() === 'li')
    .map(
      (item): LexicalElementNode => ({
        type: 'listitem',
        version: 1,
        children: convertInlineNodes(item.children ?? []),
      }),
    )
}

/**
 * Create a standard text node with all required Lexical properties
 *
 * @param text - Text content.
 * @param format - Lexical format bit field; defaults to 0 (no formatting).
 */
function createTextNode(text: string, format?: number): LexicalTextContent {
  return {
    type: 'text',
    version: 1,
    text,
    detail: 0,
    format: format ?? 0,
    mode: 'normal',
    style: '',
  }
}

/**
 * Convert inline nodes (text, links, formatting).
 *
 * Always returns at least one node: a single empty text node when nothing
 * was produced.
 */
function convertInlineNodes(nodes: HtmlAstNode[]): LexicalContent[] {
  const result: LexicalContent[] = []

  for (const node of nodes) {
    // Handle text nodes
    if (node.type === 'text') {
      const text = readText(node)
      if (text) {
        result.push(createTextNode(text))
      }
      continue
    }

    // Skip anything without an element name (e.g. comment nodes)
    if (!node.name) continue

    const tag = node.name.toLowerCase()
    const children = node.children ?? []

    switch (tag) {
      // NOTE: Payload's Lexical link validation is very strict. For now, convert links to text
      // TODO: Implement proper link format after investigating Payload's link node requirements
      case 'a': {
        const text = extractText(children)
        const href = readAttr(node, 'href')
        if (text) {
          // Include URL as text for now
          result.push(createTextNode(href && href !== '#' ? `${text} (${href})` : text))
        }
        break
      }

      case 'strong':
      case 'b':
        result.push(createTextNode(extractText(children), TEXT_FORMAT.bold))
        break

      case 'em':
      case 'i':
        result.push(createTextNode(extractText(children), TEXT_FORMAT.italic))
        break

      case 'u':
        result.push(createTextNode(extractText(children), TEXT_FORMAT.underline))
        break

      // Keep the break so adjacent text runs are not joined together
      case 'br':
        result.push({ type: 'linebreak', version: 1 })
        break

      // Inline images become a text placeholder; a paragraph node here
      // would nest a block inside a block.
      case 'img':
        result.push(createImagePlaceholder(readAttr(node, 'src'), readAttr(node, 'alt')))
        break

      // Spans are treated as plain text
      case 'span': {
        const text = extractText(children)
        if (text) {
          result.push(createTextNode(text))
        }
        break
      }

      case 'code':
        result.push({
          ...createTextNode(extractText(children)),
          style: 'font-family: monospace;',
        })
        break

      // Recursively handle other inline elements
      default:
        result.push(...convertInlineNodes(children))
    }
  }

  return result.length > 0 ? result : [createTextNode('')]
}

/**
 * Create a heading node
 *
 * @param tag - Lower-case heading tag name, 'h1' through 'h6'.
 * @param children - AST children of the heading element.
 */
function createHeading(tag: string, children: HtmlAstNode[]): LexicalElementNode {
  return {
    type: 'heading',
    version: 1,
    tag,
    children: convertInlineNodes(children),
  }
}

/**
 * Create the italic "[Image: ...]" text placeholder used for images
 * (alt text when present, otherwise the src).
 */
function createImagePlaceholder(src: string, alt: string): LexicalTextContent {
  return {
    ...createTextNode(`[Image: ${alt || src}]`),
    style: 'font-style: italic;',
  }
}

/**
 * Create a block-level image node (a paragraph wrapping the placeholder)
 */
function createImageNode(src: string, alt: string): LexicalElementNode {
  return createParagraph([createImagePlaceholder(src, alt)])
}

/**
 * Extract plain text from nodes, descending into children
 */
function extractText(nodes: HtmlAstNode[]): string {
  let text = ''
  for (const node of nodes) {
    if (node.type === 'text') {
      text += readText(node)
    } else if (node.children) {
      text += extractText(node.children)
    } else if (node.content) {
      text += node.content
    }
  }
  return text
}

// ============================================================
// UTILITY FUNCTIONS
// ============================================================

/** Narrow an unknown value to a non-null object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null
}

/** True when value looks like a Lexical root: type 'root' plus a children array. */
function isRootNode(value: unknown): boolean {
  return isRecord(value) && value.type === 'root' && Array.isArray(value.children)
}

/**
 * Check if a string is valid Lexical JSON.
 *
 * Accepts both a bare root node and the `{ root: ... }` wrapper that
 * htmlToLexical produces.
 */
export function isValidLexical(json: string): boolean {
  try {
    const parsed: unknown = JSON.parse(json)
    return isRootNode(parsed) || (isRecord(parsed) && isRootNode(parsed.root))
  } catch {
    return false
  }
}

/**
 * Convert multiple HTML contents to Lexical format
 */
export function batchHtmlToLexical(htmlArray: string[]): string[] {
  return htmlArray.map((html) => htmlToLexical(html))
}