/**
* HTML Parser Module
* Story 1.3: Content Migration Script
*
* Parses HTML files from Webflow to extract structured data
* Used when JSON export is not available
*/
import type { WebflowExportData } from './types'
import { toSlug, cleanHTML, htmlToPlainText } from './utils'
import { load as cheerioLoad, CheerioAPI } from 'cheerio'
// ============================================================
// MAIN PARSER FUNCTION
// ============================================================
/**
 * Builds the structured Webflow export from a raw HTML string.
 *
 * The markup is loaded into cheerio once, and that single document handle is
 * handed to the post, category and portfolio extractors, in that order.
 *
 * @param html - Raw HTML markup to parse.
 * @param sourceUrl - Accepted for API compatibility; not read by this function.
 * @returns The extracted posts, categories and portfolio items.
 */
export function parseWebflowHTML(html: string, sourceUrl?: string): WebflowExportData {
  const dom = cheerioLoad(html)

  const posts = extractPosts(dom)
  const categories = extractCategories(dom)
  const portfolio = extractPortfolio(dom)

  return { posts, categories, portfolio }
}
/**
 * Reads an HTML file from disk and parses it with `parseWebflowHTML`.
 *
 * The return type is derived from `parseWebflowHTML` so the two functions
 * cannot drift apart; a bare `Promise` without a type argument does not
 * type-check.
 *
 * @param filePath - Path of the HTML file; its contents are decoded as UTF-8.
 * @returns The data extracted from the file's markup.
 * @throws Rejects with the underlying file-system error when the file cannot be read.
 */
export async function parseHTMLFile(
  filePath: string,
): Promise<ReturnType<typeof parseWebflowHTML>> {
  // The dynamic import defers loading `fs/promises` until the first call.
  const { readFile } = await import('fs/promises')
  const html = await readFile(filePath, 'utf-8')
  return parseWebflowHTML(html)
}
// ============================================================
// POST EXTRACTION
// ============================================================
/**
 * Collects blog posts from the loaded document.
 *
 * Container selectors are tried in priority order. The first selector that
 * produces at least one titled post wins and later selectors are not
 * consulted. Items without a title are skipped.
 *
 * This is a generic extractor - adjust the selectors to the actual Webflow
 * HTML structure.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns One entry per titled item; empty when nothing matches.
 */
function extractPosts($: CheerioAPI): Array<{
  title: string
  slug: string
  content: string
  publishedDate: string
  postCategory?: string
  featuredImage?: string
  seoTitle?: string
  seoDescription?: string
}> {
  const collected: ReturnType<typeof extractPosts> = []

  // Ordered from the Webflow collection item down to generic markup.
  const candidateSelectors = ['.w-dyn-item', '.blog-post', '.post-item', 'article']

  for (const candidate of candidateSelectors) {
    const matches = $(candidate)
    if (matches.length === 0) continue

    matches.each((_position, node) => {
      const $post = $(node)

      // Trimmed text of the first descendant matching `selector`.
      const textOfFirst = (selector: string): string =>
        $post.find(selector).first().text().trim()

      // `content` attribute of `primary`, falling back to `fallback`.
      const contentAttr = (primary: string, fallback: string): string | undefined =>
        $post.find(primary).attr('content') ||
        $post.find(fallback).attr('content') ||
        undefined

      const title =
        textOfFirst('h1, h2, h3, .post-title, .blog-title') ||
        $post.find('[data-field="title"]').text().trim()
      if (!title) return

      // Slug: last non-empty path segment of the first link, otherwise
      // derived from the title.
      const href = $post.find('a').first().attr('href') || ''
      let slug: string | undefined
      if (href) {
        slug = href
          .split('/')
          .filter((segment) => segment !== '')
          .pop()
      } else {
        slug = toSlug(title)
      }

      const $body = $post.find('.post-content, .blog-content, .content').first()
      let content = ''
      if ($body.length) {
        content = cleanHTML($body.html() || '')
      }

      // Visible date text first, then the <time datetime> attribute, and as a
      // last resort the moment of extraction.
      const publishedDate =
        textOfFirst('.post-date, .blog-date, .date, time') ||
        $post.find('time').first().attr('datetime') ||
        new Date().toISOString()

      const categoryName =
        textOfFirst('.post-category, .blog-category, .category') ||
        $post.find('[data-field="category"]').text().trim()

      const featuredImage =
        $post.find('img').first().attr('src') ||
        $post.find('[data-field="featured-image"]').attr('src') ||
        undefined

      const seoTitle = contentAttr('meta[property="og:title"]', '[data-field="seo-title"]')
      const seoDescription = contentAttr(
        'meta[property="og:description"]',
        '[data-field="seo-description"]',
      )

      collected.push({
        title,
        // The link may yield no usable segment (e.g. "/"); fall back to the title.
        slug: slug || toSlug(title),
        content,
        publishedDate,
        postCategory: categoryName || undefined,
        featuredImage,
        seoTitle,
        seoDescription,
      })
    })

    // Stop at the first selector that actually produced posts.
    if (collected.length > 0) break
  }

  return collected
}
// ============================================================
// CATEGORY EXTRACTION
// ============================================================
/**
 * Collects the distinct categories mentioned in the document.
 *
 * Every selector is scanned (there is no early exit). A category is
 * identified by its display name, and the first occurrence of a name decides
 * its colour. When the markup contains no category at all, the fixed list
 * known from the story is returned with a default blue.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns Categories in first-seen order, or the known fallback list.
 */
function extractCategories($: CheerioAPI): Array<{
  name: string
  slug: string
  colorHex?: string
}> {
  // Keyed by name: a Map keeps first-seen order and makes duplicates a no-op.
  const byName = new Map<string, ReturnType<typeof extractCategories>[number]>()

  const selectors = [
    '.category-link',
    '.post-category',
    '.filter-category',
    '[data-field="category"]',
  ]

  for (const selector of selectors) {
    $(selector).each((_position, node) => {
      const $node = $(node)

      const name = $node.text().trim() || $node.attr('data-category') || ''
      if (!name || byName.has(name)) return

      // Look for a 3- or 6-digit hex colour in the inline style, if any.
      const inlineStyle = $node.attr('style') || ''
      const hex = /color:\s*#?([a-f0-9]{6}|[a-f0-9]{3})/i.exec(inlineStyle)

      byName.set(name, {
        name,
        slug: toSlug(name),
        colorHex: hex ? `#${hex[1]}` : undefined,
      })
    })
  }

  if (byName.size > 0) {
    return Array.from(byName.values())
  }

  // Nothing found in the markup: use the categories known from the story,
  // all with the default blue colour.
  const defaultColor = '#0066cc'
  return [
    { name: 'Google小學堂', slug: 'google-workshop' },
    { name: 'Meta小學堂', slug: 'meta-workshop' },
    { name: '行銷時事最前線', slug: 'marketing-news' },
    { name: '恩群數位最新公告', slug: 'enchun-announcements' },
  ].map((known) => ({ ...known, colorHex: defaultColor }))
}
// ============================================================
// PORTFOLIO EXTRACTION
// ============================================================
/**
 * Extract portfolio items from HTML.
 *
 * Container selectors are tried in priority order; the first selector that
 * yields at least one named item wins and later selectors are not consulted.
 * Items without a name are skipped.
 *
 * The website type is taken from the item's own `class` attribute: the first
 * class that equals one of the known type names is used, otherwise `'other'`.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns One entry per named portfolio item; empty when nothing matches.
 */
function extractPortfolio($: CheerioAPI): Array<{
  name: string
  slug: string
  websiteLink: string
  previewImage: string
  description: string
  websiteType: 'corporate' | 'ecommerce' | 'landing' | 'brand' | 'other'
  tags: string
}> {
  type PortfolioItem = ReturnType<typeof extractPortfolio>[number]
  type WebsiteType = PortfolioItem['websiteType']

  const websiteTypes: readonly WebsiteType[] = [
    'corporate',
    'ecommerce',
    'landing',
    'brand',
    'other',
  ]
  const isWebsiteType = (value: string): value is WebsiteType =>
    (websiteTypes as readonly string[]).includes(value)

  const portfolio: PortfolioItem[] = []

  // Common portfolio selectors
  const portfolioSelectors = ['.portfolio-item', '.work-item', '.project-item', '.case-study']

  for (const selector of portfolioSelectors) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Extract title/name
      const name =
        $item.find('h2, h3, h4, .portfolio-title, .project-title').first().text().trim() ||
        $item.find('[data-field="name"]').text().trim()
      if (!name) return

      // Extract link
      const link =
        $item.find('a').first().attr('href') ||
        $item.find('[data-field="website-link"]').attr('href') ||
        ''

      // Extract image
      const image =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="preview-image"]').attr('src') ||
        ''

      // Extract description
      const description =
        $item
          .find('.portfolio-description, .project-description, .description')
          .first()
          .text()
          .trim() || ''

      // Extract tags
      const tags = $item.find('.tag, .tags').first().text().trim() || ''

      // Cheerio nodes are parser elements, not browser DOM elements, so they
      // have no `classList`; reading it threw on every named item. Use the
      // `class` attribute instead.
      const classNames = ($item.attr('class') || '').split(/\s+/).filter(Boolean)
      const websiteType = classNames.find(isWebsiteType) || 'other'

      portfolio.push({
        name,
        slug: toSlug(name),
        websiteLink: link,
        previewImage: image,
        description,
        websiteType,
        tags,
      })
    })

    // If we found portfolio items, stop trying further selectors.
    if (portfolio.length > 0) break
  }

  return portfolio
}
// ============================================================
// URL EXTRACTION
// ============================================================
/**
 * Extract all image URLs from HTML.
 *
 * For every `<img>` the `src`, `data-src` and each `srcset` candidate URL are
 * collected. Duplicates are dropped and first-seen order is kept.
 *
 * @param html - Raw HTML markup to scan.
 * @returns The distinct image URLs found; empty when there are none.
 */
export function extractImageUrls(html: string): string[] {
  const $ = cheerioLoad(html)
  // Typed explicitly: an untyped Set would turn the result into `unknown[]`.
  const urls = new Set<string>()

  $('img').each((_index, element) => {
    const $img = $(element)
    const src = $img.attr('src')
    const dataSrc = $img.attr('data-src')
    const srcset = $img.attr('srcset')

    if (src) urls.add(src)
    if (dataSrc) urls.add(dataSrc)

    if (srcset) {
      for (const candidate of srcset.split(',')) {
        // A candidate is "<url> <descriptor>"; the separator may be any
        // whitespace (tab, newline), not only a single space.
        const url = candidate.trim().split(/\s+/)[0]
        if (url) urls.add(url)
      }
    }
  })

  return Array.from(urls)
}
/**
 * Extract all media URLs from parsed data.
 *
 * Gathers each post's `featuredImage` and each portfolio item's
 * `previewImage`, skipping empty values and dropping duplicates. Posts are
 * visited before portfolio items and first-seen order is kept.
 *
 * @param data - Parsed export; `posts` and `portfolio` may be absent.
 * @returns The distinct media URLs referenced by the data.
 */
export function extractMediaUrls(data: WebflowExportData): string[] {
  // Typed explicitly: an untyped Set would turn the result into `unknown[]`.
  const urls = new Set<string>()

  // From posts
  for (const post of data.posts || []) {
    if (post.featuredImage) urls.add(post.featuredImage)
  }

  // From portfolio
  for (const item of data.portfolio || []) {
    if (item.previewImage) urls.add(item.previewImage)
  }

  return Array.from(urls)
}