// Update Payload CMS configuration, collections (Audit, Posts), and add migration scripts/reports.
// 350 lines · 9.1 KiB · TypeScript
/**
 * HTML Parser Module
 * Story 1.3: Content Migration Script
 *
 * Parses HTML files from Webflow to extract structured data.
 * Used when a JSON export is not available.
 */
|
|
|
|
import type { WebflowExportData } from './types'
|
|
import { toSlug, cleanHTML, htmlToPlainText } from './utils'
|
|
import { load as cheerioLoad, CheerioAPI } from 'cheerio'
|
|
|
|
// ============================================================
// MAIN PARSER FUNCTION
// ============================================================
|
|
|
|
/**
 * Parse Webflow HTML markup into structured export data.
 *
 * The markup is loaded once with cheerio and the same document is handed to
 * the post, category and portfolio extractors, in that order.
 *
 * @param html - Raw HTML markup to parse.
 * @param sourceUrl - Accepted for API compatibility; not read by this function.
 * @returns The extracted posts, categories and portfolio items.
 */
export function parseWebflowHTML(html: string, sourceUrl?: string): WebflowExportData {
  const doc = cheerioLoad(html)

  const posts = extractPosts(doc)
  const categories = extractCategories(doc)
  const portfolio = extractPortfolio(doc)

  return { posts, categories, portfolio }
}
|
|
|
|
/**
 * Read an HTML file from disk and parse it with `parseWebflowHTML`.
 *
 * @param filePath - Path of the HTML file; its contents are read as UTF-8.
 * @returns The data extracted from the file's markup. The promise rejects if
 *   the file cannot be read.
 */
export async function parseHTMLFile(filePath: string): Promise<WebflowExportData> {
  // The fs module is imported lazily, at call time.
  const fsPromises = await import('fs/promises')
  const markup = await fsPromises.readFile(filePath, 'utf-8')

  return parseWebflowHTML(markup)
}
|
|
|
|
// ============================================================
// POST EXTRACTION
// ============================================================
|
|
|
|
/**
 * Extract blog posts from a loaded HTML document.
 *
 * Candidate selectors are tried in order; the first selector that produces at
 * least one post wins and the remaining selectors are not consulted. Elements
 * without a title are skipped.
 *
 * This is a generic extractor - customize it for the actual Webflow HTML
 * structure.
 *
 * @param $ - Cheerio document to search.
 * @returns One record per matched element that has a non-empty title.
 */
function extractPosts($: CheerioAPI): Array<{
  title: string
  slug: string
  content: string
  publishedDate: string
  postCategory?: string
  featuredImage?: string
  seoTitle?: string
  seoDescription?: string
}> {
  // Typed from the declared return type so every pushed record is checked.
  const posts: ReturnType<typeof extractPosts> = []

  // Common Webflow blog post selectors
  const postSelectors = [
    '.w-dyn-item', // Webflow collection item
    '.blog-post',
    '.post-item',
    'article',
  ]

  for (const selector of postSelectors) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Title: first heading / title-like element, else the data-field marker
      const title =
        $item.find('h1, h2, h3, .post-title, .blog-title').first().text().trim() ||
        $item.find('[data-field="title"]').text().trim()

      // A post without a title is not usable; skip this element
      if (!title) return

      // Slug: last path segment of the first link. The query string and
      // fragment are dropped first so hrefs such as "#" or "/blog/post?ref=1"
      // do not leak "#" / "post?ref=1" into the slug. If no segment is left,
      // the slug is derived from the title instead.
      const href = $item.find('a').first().attr('href') ?? ''
      const hrefPath = href.split(/[?#]/)[0] ?? ''
      const slug = hrefPath.split('/').filter(Boolean).pop() || toSlug(title)

      // Content: cleaned inner HTML of the first content container, if any
      const contentEl = $item.find('.post-content, .blog-content, .content').first()
      const content = contentEl.length ? cleanHTML(contentEl.html() || '') : ''

      // Date: visible text of the first date element, then the <time datetime>
      // attribute, and finally the current time when neither is present
      const dateStr =
        $item.find('.post-date, .blog-date, .date, time').first().text().trim() ||
        $item.find('time').first().attr('datetime') ||
        new Date().toISOString()

      // Category label (empty string when absent)
      const category =
        $item.find('.post-category, .blog-category, .category').first().text().trim() ||
        $item.find('[data-field="category"]').text().trim()

      // Featured image: first <img>, else the data-field marker
      const image =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="featured-image"]').attr('src') ||
        undefined

      // SEO meta found inside the item
      const seoTitle =
        $item.find('meta[property="og:title"]').attr('content') ||
        $item.find('[data-field="seo-title"]').attr('content') ||
        undefined

      const seoDescription =
        $item.find('meta[property="og:description"]').attr('content') ||
        $item.find('[data-field="seo-description"]').attr('content') ||
        undefined

      posts.push({
        title,
        slug,
        content,
        publishedDate: dateStr,
        postCategory: category || undefined,
        featuredImage: image,
        seoTitle,
        seoDescription,
      })
    })

    // The first selector that yields posts wins
    if (posts.length > 0) break
  }

  return posts
}
|
|
|
|
// ============================================================
// CATEGORY EXTRACTION
// ============================================================
|
|
|
|
/**
 * Extract categories from a loaded HTML document.
 *
 * Every candidate selector is scanned (there is no early exit) and categories
 * are de-duplicated by their display name. When nothing is found, a fixed list
 * of known categories with a default colour is returned instead.
 *
 * @param $ - Cheerio document to search.
 * @returns The unique categories found, or the known defaults.
 */
function extractCategories($: CheerioAPI): Array<{
  name: string
  slug: string
  colorHex?: string
}> {
  // Typed from the declared return type so every pushed record is checked.
  const categories: ReturnType<typeof extractCategories> = []

  // Common category selectors
  const categorySelectors = [
    '.category-link',
    '.post-category',
    '.filter-category',
    '[data-field="category"]',
  ]

  // Matches only the `color` declaration of an inline style. The property must
  // start the style or follow a `;`, so `background-color` is not mistaken for
  // it, and the `#` is required so named colours (e.g. `cadetblue`) are not
  // read as hex digits.
  const colorPattern = /(?:^|;)\s*color\s*:\s*#([a-f0-9]{6}|[a-f0-9]{3})/i

  const uniqueCategories = new Set<string>()

  for (const selector of categorySelectors) {
    $(selector).each((_index, element) => {
      const $item = $(element)
      const name = $item.text().trim() || $item.attr('data-category') || ''

      // Skip unnamed elements and names that were already collected
      if (!name || uniqueCategories.has(name)) return
      uniqueCategories.add(name)

      // Try to extract the text colour from the style attribute
      const style = $item.attr('style') ?? ''
      const colorMatch = colorPattern.exec(style)
      const colorHex = colorMatch ? `#${colorMatch[1]}` : undefined

      categories.push({
        name,
        slug: toSlug(name),
        colorHex,
      })
    })
  }

  // Known categories from the story
  const knownCategories = [
    { name: 'Google小學堂', slug: 'google-workshop' },
    { name: 'Meta小學堂', slug: 'meta-workshop' },
    { name: '行銷時事最前線', slug: 'marketing-news' },
    { name: '恩群數位最新公告', slug: 'enchun-announcements' },
  ]

  // Fall back to the known categories only when none were found in the HTML
  if (categories.length === 0) {
    return knownCategories.map((cat) => ({
      ...cat,
      colorHex: '#0066cc', // Default blue color
    }))
  }

  return categories
}
|
|
|
|
// ============================================================
// PORTFOLIO EXTRACTION
// ============================================================
|
|
|
|
/**
 * Extract portfolio items from a loaded HTML document.
 *
 * Candidate selectors are tried in order; the first selector that produces at
 * least one item wins. Elements without a name are skipped.
 *
 * @param $ - Cheerio document to search.
 * @returns One record per matched element that has a non-empty name.
 */
function extractPortfolio($: CheerioAPI): Array<{
  name: string
  slug: string
  websiteLink: string
  previewImage: string
  description: string
  websiteType: 'corporate' | 'ecommerce' | 'landing' | 'brand' | 'other'
  tags: string
}> {
  const websiteTypes = ['corporate', 'ecommerce', 'landing', 'brand', 'other'] as const
  type WebsiteType = (typeof websiteTypes)[number]

  const isWebsiteType = (className: string): className is WebsiteType =>
    (websiteTypes as readonly string[]).includes(className)

  // Typed from the declared return type so every pushed record is checked.
  const portfolio: ReturnType<typeof extractPortfolio> = []

  // Common portfolio selectors
  const portfolioSelectors = [
    '.portfolio-item',
    '.work-item',
    '.project-item',
    '.case-study',
  ]

  for (const selector of portfolioSelectors) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Title/name: first heading / title-like element, else the data-field marker
      const name =
        $item.find('h2, h3, h4, .portfolio-title, .project-title').first().text().trim() ||
        $item.find('[data-field="name"]').text().trim()

      // An item without a name is not usable; skip this element
      if (!name) return

      // Website link: first anchor, else the data-field marker
      const link =
        $item.find('a').first().attr('href') ||
        $item.find('[data-field="website-link"]').attr('href') ||
        ''

      // Preview image: first <img>, else the data-field marker
      const image =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="preview-image"]').attr('src') ||
        ''

      // Description text (empty string when absent)
      const description =
        $item
          .find('.portfolio-description, .project-description, .description')
          .first()
          .text()
          .trim() || ''

      // Tags text (empty string when absent)
      const tags = $item.find('.tag, .tags').first().text().trim() || ''

      // Website type: the first class on the item that names a known type.
      // Cheerio elements are not DOM elements and have no `classList`, so the
      // class names are read from the `class` attribute instead.
      const classNames = ($item.attr('class') ?? '').split(/\s+/)
      const websiteType: WebsiteType = classNames.find(isWebsiteType) ?? 'other'

      portfolio.push({
        name,
        slug: toSlug(name),
        websiteLink: link,
        previewImage: image,
        description,
        websiteType,
        tags,
      })
    })

    // The first selector that yields portfolio items wins
    if (portfolio.length > 0) break
  }

  return portfolio
}
|
|
|
|
// ============================================================
// URL EXTRACTION
// ============================================================
|
|
|
|
/**
 * Extract all image URLs referenced by `<img>` elements in the given HTML.
 *
 * Collects the `src` and `data-src` attributes plus every candidate URL listed
 * in `srcset`. Duplicates are removed; first-seen order is kept.
 *
 * @param html - Raw HTML markup to scan.
 * @returns The unique image URLs found.
 */
export function extractImageUrls(html: string): string[] {
  const $ = cheerioLoad(html)
  const urls = new Set<string>()

  $('img').each((_index, element) => {
    const $img = $(element)
    const src = $img.attr('src')
    const dataSrc = $img.attr('data-src')
    const srcset = $img.attr('srcset')

    if (src) urls.add(src)
    if (dataSrc) urls.add(dataSrc)

    if (srcset) {
      // Each candidate is "<url> <descriptor>". The two may be separated by
      // any whitespace (spaces, tabs, newlines), so split on a whitespace run
      // rather than a single space.
      // NOTE(review): URLs that themselves contain commas are still split here.
      for (const candidate of srcset.split(',')) {
        const [url] = candidate.trim().split(/\s+/)
        if (url) urls.add(url)
      }
    }
  })

  return Array.from(urls)
}
|
|
|
|
/**
 * Collect the media URLs referenced by already-parsed export data.
 *
 * Gathers each post's `featuredImage` followed by each portfolio item's
 * `previewImage`, skipping empty values and dropping duplicates while keeping
 * first-seen order.
 *
 * @param data - Parsed export data; `posts` and `portfolio` may be absent.
 * @returns The unique media URLs.
 */
export function extractMediaUrls(data: WebflowExportData): string[] {
  const seen = new Set<string>()

  // Records a URL unless it is missing or empty
  const remember = (url: string | null | undefined): void => {
    if (url) seen.add(url)
  }

  // Posts contribute their featured image
  data.posts?.forEach((post) => remember(post.featuredImage))

  // Portfolio items contribute their preview image
  data.portfolio?.forEach((item) => remember(item.previewImage))

  return [...seen]
}
|