/**
 * HTML Parser Module
 * Story 1.3: Content Migration Script
 *
 * Parses HTML files from Webflow to extract structured data.
 * Used when a JSON export is not available.
 */

import type { WebflowExportData } from './types'
import { toSlug, cleanHTML, htmlToPlainText } from './utils'
import { load as cheerioLoad } from 'cheerio'
import type { CheerioAPI } from 'cheerio'

// ============================================================
// RESULT SHAPES & CONSTANTS
// ============================================================

/** Blog post record produced by `extractPosts`. */
type ParsedPost = {
  title: string
  slug: string
  content: string
  publishedDate: string
  postCategory?: string
  featuredImage?: string
  seoTitle?: string
  seoDescription?: string
}

/** Category record produced by `extractCategories`. */
type ParsedCategory = {
  name: string
  slug: string
  colorHex?: string
}

/** Website types that a portfolio item can be tagged with via a CSS class. */
const WEBSITE_TYPES = ['corporate', 'ecommerce', 'landing', 'brand', 'other'] as const

type WebsiteType = (typeof WEBSITE_TYPES)[number]

/** Portfolio record produced by `extractPortfolio`. */
type ParsedPortfolioItem = {
  name: string
  slug: string
  websiteLink: string
  previewImage: string
  description: string
  websiteType: WebsiteType
  tags: string
}

/** Selectors tried, in order, to locate blog post containers. */
const POST_SELECTORS = [
  '.w-dyn-item', // Webflow collection item
  '.blog-post',
  '.post-item',
  'article',
]

/** Selectors whose matches are all collected as category candidates. */
const CATEGORY_SELECTORS = [
  '.category-link',
  '.post-category',
  '.filter-category',
  '[data-field="category"]',
]

/** Selectors tried, in order, to locate portfolio item containers. */
const PORTFOLIO_SELECTORS = ['.portfolio-item', '.work-item', '.project-item', '.case-study']

/** Known categories from the story; used only when the HTML yields none. */
const KNOWN_CATEGORIES = [
  { name: 'Google小學堂', slug: 'google-workshop' },
  { name: 'Meta小學堂', slug: 'meta-workshop' },
  { name: '行銷時事最前線', slug: 'marketing-news' },
  { name: '恩群數位最新公告', slug: 'enchun-announcements' },
]

/** Colour assigned to the fallback categories (default blue). */
const DEFAULT_CATEGORY_COLOR = '#0066cc'

/** Picks a 3- or 6-digit hex colour out of an inline `style` attribute. */
const STYLE_COLOR_PATTERN = /color:\s*#?([a-f0-9]{6}|[a-f0-9]{3})/i

// ============================================================
// MAIN PARSER FUNCTION
// ============================================================

/**
 * Parse HTML content and extract Webflow data.
 *
 * @param html - Raw HTML markup to parse.
 * @param sourceUrl - Accepted for API compatibility; currently not used by the parser.
 * @returns Posts, categories and portfolio items found in the markup.
 */
export function parseWebflowHTML(html: string, sourceUrl?: string): WebflowExportData {
  const $ = cheerioLoad(html)

  return {
    posts: extractPosts($),
    categories: extractCategories($),
    portfolio: extractPortfolio($),
  }
}

/**
 * Parse an HTML file from disk.
 *
 * @param filePath - Path of the HTML file, read as UTF-8.
 * @returns The data extracted by `parseWebflowHTML`.
 */
export async function parseHTMLFile(filePath: string): Promise<WebflowExportData> {
  // Loaded lazily so that importing this module does not itself pull in `fs`.
  const { readFile } = await import('fs/promises')
  const html = await readFile(filePath, 'utf-8')
  return parseWebflowHTML(html)
}

// ============================================================
// POST EXTRACTION
// ============================================================

/**
 * Derive a slug candidate from an href: the last non-empty path segment,
 * with any query string or fragment removed. Returns '' when nothing usable
 * remains (e.g. for '' or '#').
 */
function slugFromHref(href: string): string {
  const withoutQueryOrHash = href.split(/[?#]/)[0] ?? ''
  return withoutQueryOrHash.split('/').filter(Boolean).pop() ?? ''
}

/**
 * Extract blog posts from HTML.
 * This is a generic extractor - customize based on actual Webflow HTML structure.
 *
 * Selectors are tried in order; the first selector that yields at least one
 * post with a title wins and later selectors are not consulted.
 */
function extractPosts($: CheerioAPI): ParsedPost[] {
  const posts: ParsedPost[] = []

  for (const selector of POST_SELECTORS) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Title: a heading or title class first, then the data-field fallback.
      const title =
        $item.find('h1, h2, h3, .post-title, .blog-title').first().text().trim() ||
        $item.find('[data-field="title"]').text().trim()

      // Items without a title are not treated as posts.
      if (!title) return

      // Slug: taken from the first link inside the item, otherwise from the title.
      const link = $item.find('a').first().attr('href') || ''
      const slug = slugFromHref(link) || toSlug(title)

      // Content: cleaned inner HTML of the first content container, if any.
      const contentEl = $item.find('.post-content, .blog-content, .content').first()
      const content = contentEl.length ? cleanHTML(contentEl.html() || '') : ''

      // Date: visible text, then the <time datetime> attribute, then "now".
      const publishedDate =
        $item.find('.post-date, .blog-date, .date, time').first().text().trim() ||
        $item.find('time').first().attr('datetime') ||
        new Date().toISOString()

      const category =
        $item.find('.post-category, .blog-category, .category').first().text().trim() ||
        $item.find('[data-field="category"]').text().trim()

      const featuredImage =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="featured-image"]').attr('src') ||
        undefined

      // SEO meta, looked up inside the item itself.
      const seoTitle =
        $item.find('meta[property="og:title"]').attr('content') ||
        $item.find('[data-field="seo-title"]').attr('content') ||
        undefined

      const seoDescription =
        $item.find('meta[property="og:description"]').attr('content') ||
        $item.find('[data-field="seo-description"]').attr('content') ||
        undefined

      posts.push({
        title,
        slug,
        content,
        publishedDate,
        postCategory: category || undefined,
        featuredImage,
        seoTitle,
        seoDescription,
      })
    })

    // Stop at the first selector that actually produced posts.
    if (posts.length > 0) break
  }

  return posts
}

// ============================================================
// CATEGORY EXTRACTION
// ============================================================

/**
 * Extract categories from HTML.
 *
 * Matches from every category selector are collected and de-duplicated by
 * name. When none are found, the known categories are returned with the
 * default colour.
 */
function extractCategories($: CheerioAPI): ParsedCategory[] {
  const categories: ParsedCategory[] = []
  const seenNames = new Set<string>()

  for (const selector of CATEGORY_SELECTORS) {
    $(selector).each((_index, element) => {
      const $item = $(element)
      const name = $item.text().trim() || $item.attr('data-category') || ''

      if (!name || seenNames.has(name)) return
      seenNames.add(name)

      // Try to extract a colour from the inline style attribute.
      const colorMatch = ($item.attr('style') || '').match(STYLE_COLOR_PATTERN)

      categories.push({
        name,
        slug: toSlug(name),
        colorHex: colorMatch ? `#${colorMatch[1]}` : undefined,
      })
    })
  }

  // Fall back to the known categories if the markup contained none.
  if (categories.length === 0) {
    return KNOWN_CATEGORIES.map((cat) => ({
      ...cat,
      colorHex: DEFAULT_CATEGORY_COLOR,
    }))
  }

  return categories
}

// ============================================================
// PORTFOLIO EXTRACTION
// ============================================================

/** Type guard: is `value` one of the recognised website types? */
function isWebsiteType(value: string): value is WebsiteType {
  return (WEBSITE_TYPES as readonly string[]).includes(value)
}

/**
 * Extract portfolio items from HTML.
 *
 * Selectors are tried in order; the first selector that yields at least one
 * named item wins and later selectors are not consulted.
 */
function extractPortfolio($: CheerioAPI): ParsedPortfolioItem[] {
  const portfolio: ParsedPortfolioItem[] = []

  for (const selector of PORTFOLIO_SELECTORS) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      const name =
        $item.find('h2, h3, h4, .portfolio-title, .project-title').first().text().trim() ||
        $item.find('[data-field="name"]').text().trim()

      // Items without a name are skipped.
      if (!name) return

      const websiteLink =
        $item.find('a').first().attr('href') ||
        $item.find('[data-field="website-link"]').attr('href') ||
        ''

      const previewImage =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="preview-image"]').attr('src') ||
        ''

      const description =
        $item
          .find('.portfolio-description, .project-description, .description')
          .first()
          .text()
          .trim() || ''

      const tags = $item.find('.tag, .tags').first().text().trim() || ''

      // Website type comes from a matching CSS class on the item itself.
      // cheerio nodes do not expose the DOM `classList`, so the `class`
      // attribute is split manually.
      const classNames = ($item.attr('class') ?? '').split(/\s+/)
      const websiteType: WebsiteType = classNames.find(isWebsiteType) ?? 'other'

      portfolio.push({
        name,
        slug: toSlug(name),
        websiteLink,
        previewImage,
        description,
        websiteType,
        tags,
      })
    })

    // Stop at the first selector that actually produced items.
    if (portfolio.length > 0) break
  }

  return portfolio
}

// ============================================================
// URL EXTRACTION
// ============================================================

/**
 * Extract all image URLs from HTML.
 *
 * Collects `src`, `data-src` and every `srcset` candidate URL of each <img>,
 * de-duplicated and in first-seen order.
 */
export function extractImageUrls(html: string): string[] {
  const $ = cheerioLoad(html)
  const urls = new Set<string>()

  $('img').each((_index, element) => {
    const $img = $(element)
    const src = $img.attr('src')
    const dataSrc = $img.attr('data-src')
    const srcset = $img.attr('srcset')

    if (src) urls.add(src)
    if (dataSrc) urls.add(dataSrc)

    if (srcset) {
      // Each candidate is "<url> <descriptor>"; keep only the URL part.
      for (const candidate of srcset.split(',')) {
        const url = candidate.trim().split(/\s+/)[0]
        if (url) urls.add(url)
      }
    }
  })

  return Array.from(urls)
}

/**
 * Extract all media URLs from parsed data.
 *
 * Collects the featured image of each post and the preview image of each
 * portfolio item, de-duplicated and in first-seen order.
 */
export function extractMediaUrls(data: WebflowExportData): string[] {
  const urls = new Set<string>()

  // From posts
  for (const post of data.posts ?? []) {
    if (post.featuredImage) urls.add(post.featuredImage)
  }

  // From portfolio
  for (const item of data.portfolio ?? []) {
    if (item.previewImage) urls.add(item.previewImage)
  }

  return Array.from(urls)
}