feat(backend): update collections, config and migration tools
Update Payload CMS configuration, collections (Audit, Posts), and add migration scripts/reports.
This commit is contained in:
349
apps/backend/scripts/migration/htmlParser.ts
Normal file
349
apps/backend/scripts/migration/htmlParser.ts
Normal file
@@ -0,0 +1,349 @@
|
||||
/**
|
||||
* HTML Parser Module
|
||||
* Story 1.3: Content Migration Script
|
||||
*
|
||||
* Parses HTML files from Webflow to extract structured data
|
||||
* Used when JSON export is not available
|
||||
*/
|
||||
|
||||
import type { WebflowExportData } from './types'
|
||||
import { toSlug, cleanHTML, htmlToPlainText } from './utils'
|
||||
import { load as cheerioLoad, CheerioAPI } from 'cheerio'
|
||||
|
||||
// ============================================================
|
||||
// MAIN PARSER FUNCTION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Build the Webflow export structure from a raw HTML string.
 *
 * The markup is loaded into cheerio once and the resulting document is handed
 * to the post, category and portfolio extractors in that order.
 *
 * @param html - Raw HTML markup to parse
 * @param sourceUrl - Accepted for callers that pass it; not read by this function
 * @returns The extracted posts, categories and portfolio items
 */
export function parseWebflowHTML(html: string, sourceUrl?: string): WebflowExportData {
  const document$ = cheerioLoad(html)

  const posts = extractPosts(document$)
  const categories = extractCategories(document$)
  const portfolio = extractPortfolio(document$)

  return { posts, categories, portfolio }
}
|
||||
|
||||
/**
 * Read an HTML file from disk as UTF-8 and parse it with parseWebflowHTML.
 *
 * `fs/promises` is imported lazily, at call time, rather than at module load.
 *
 * @param filePath - Path of the HTML file to read
 * @returns The data extracted from the file's markup
 */
export async function parseHTMLFile(filePath: string): Promise<WebflowExportData> {
  const fsPromises = await import('fs/promises')
  const markup = await fsPromises.readFile(filePath, 'utf-8')

  return parseWebflowHTML(markup)
}
|
||||
|
||||
// ============================================================
|
||||
// POST EXTRACTION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Extract blog posts from HTML
 *
 * This is a generic extractor - customize based on actual Webflow HTML structure.
 * Candidate container selectors are tried in order; the first selector that
 * yields at least one post with a title wins and the remaining selectors are
 * not tried. Containers without a title are skipped.
 *
 * @param $ - Cheerio document to search
 * @returns One entry per matched container that has a non-empty title
 */
function extractPosts($: CheerioAPI): Array<{
  title: string
  slug: string
  content: string
  publishedDate: string
  postCategory?: string
  featuredImage?: string
  seoTitle?: string
  seoDescription?: string
}> {
  // Typed from the declared return type so a malformed entry fails to compile
  const posts: ReturnType<typeof extractPosts> = []

  // Common Webflow blog post selectors
  const postSelectors = [
    '.w-dyn-item', // Webflow collection item
    '.blog-post',
    '.post-item',
    'article',
  ]

  for (const selector of postSelectors) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Extract title
      const title =
        $item.find('h1, h2, h3, .post-title, .blog-title').first().text().trim() ||
        $item.find('[data-field="title"]').text().trim()

      if (!title) return

      // Extract slug from the first link. The query string, fragment and any
      // scheme/host prefix are dropped first so that hrefs such as "#",
      // "/blog/my-post?ref=home" or "https://example.com/" do not produce
      // "#", "my-post?ref=home" or "example.com" as the slug.
      const link = $item.find('a').first().attr('href') || ''
      const linkPath = (link.split(/[?#]/)[0] || '').replace(
        /^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/]*/i,
        '',
      )
      const linkSlug = linkPath.split('/').filter(Boolean).pop()

      // Extract content
      const contentEl = $item.find('.post-content, .blog-content, .content').first()
      const content = contentEl.length ? cleanHTML(contentEl.html() || '') : ''

      // Extract date: prefer the machine-readable <time datetime="..."> value
      // over the human-formatted text; fall back to the current time when the
      // item carries no date at all.
      const dateStr =
        ($item.find('time[datetime]').first().attr('datetime') || '').trim() ||
        $item.find('.post-date, .blog-date, .date, time').first().text().trim() ||
        new Date().toISOString()

      // Extract category
      const category =
        $item.find('.post-category, .blog-category, .category').first().text().trim() ||
        $item.find('[data-field="category"]').text().trim()

      // Extract image
      const image =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="featured-image"]').attr('src') ||
        undefined

      // Extract SEO meta (only found when the meta elements sit inside the item)
      const seoTitle =
        $item.find('meta[property="og:title"]').attr('content') ||
        $item.find('[data-field="seo-title"]').attr('content') ||
        undefined

      const seoDescription =
        $item.find('meta[property="og:description"]').attr('content') ||
        $item.find('[data-field="seo-description"]').attr('content') ||
        undefined

      posts.push({
        title,
        // Fall back to a title-derived slug when the link gives nothing usable
        slug: linkSlug || toSlug(title),
        content,
        publishedDate: dateStr,
        postCategory: category || undefined,
        featuredImage: image,
        seoTitle,
        seoDescription,
      })
    })

    // If we found posts, do not try the less specific selectors
    if (posts.length > 0) break
  }

  return posts
}
|
||||
|
||||
// ============================================================
|
||||
// CATEGORY EXTRACTION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Extract categories from HTML
 *
 * Every category selector is scanned (no early exit) and names are
 * de-duplicated by their exact trimmed text. When nothing is found, a fixed
 * list of known categories is returned with a default blue color.
 *
 * @param $ - Cheerio document to search
 * @returns Unique categories in document order per selector, or the known defaults
 */
function extractCategories($: CheerioAPI): Array<{
  name: string
  slug: string
  colorHex?: string
}> {
  // Typed from the declared return type so a malformed entry fails to compile
  const categories: ReturnType<typeof extractCategories> = []

  // Common category selectors
  const categorySelectors = [
    '.category-link',
    '.post-category',
    '.filter-category',
    '[data-field="category"]',
  ]

  // A hex color must start with '#' and be exactly 3 or 6 hex digits. Without
  // the mandatory '#', named colors such as "cadetblue" or "deeppink" matched
  // as "#cad" / "#dee"; without the trailing boundary, "#abcd" became "#abc".
  const hexColorPattern = /color:\s*#([a-f0-9]{6}|[a-f0-9]{3})\b/i

  const uniqueCategories = new Set<string>()

  for (const selector of categorySelectors) {
    $(selector).each((_index, element) => {
      const $item = $(element)
      const name = $item.text().trim() || $item.attr('data-category') || ''

      if (!name || uniqueCategories.has(name)) return
      uniqueCategories.add(name)

      // Try to extract color from style attribute
      const style = $item.attr('style') || ''
      const colorMatch = style.match(hexColorPattern)
      const colorHex = colorMatch ? `#${colorMatch[1]}` : undefined

      categories.push({
        name,
        slug: toSlug(name),
        colorHex,
      })
    })
  }

  // Known categories from the story
  const knownCategories = [
    { name: 'Google小學堂', slug: 'google-workshop' },
    { name: 'Meta小學堂', slug: 'meta-workshop' },
    { name: '行銷時事最前線', slug: 'marketing-news' },
    { name: '恩群數位最新公告', slug: 'enchun-announcements' },
  ]

  // Fall back to the known categories only when nothing was found in the HTML
  if (categories.length === 0) {
    return knownCategories.map((cat) => ({
      ...cat,
      colorHex: '#0066cc', // Default blue color
    }))
  }

  return categories
}
|
||||
|
||||
// ============================================================
|
||||
// PORTFOLIO EXTRACTION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Extract portfolio items from HTML
 *
 * Candidate container selectors are tried in order; the first selector that
 * yields at least one named item wins and the remaining selectors are not
 * tried. Containers without a name are skipped.
 *
 * @param $ - Cheerio document to search
 * @returns One entry per matched container that has a non-empty name
 */
function extractPortfolio($: CheerioAPI): Array<{
  name: string
  slug: string
  websiteLink: string
  previewImage: string
  description: string
  websiteType: 'corporate' | 'ecommerce' | 'landing' | 'brand' | 'other'
  tags: string
}> {
  type PortfolioItem = ReturnType<typeof extractPortfolio>[number]
  type WebsiteType = PortfolioItem['websiteType']

  const portfolio: PortfolioItem[] = []

  // Class names on the container that are recognized as a website type
  const websiteTypes: readonly WebsiteType[] = [
    'corporate',
    'ecommerce',
    'landing',
    'brand',
    'other',
  ]
  const isWebsiteType = (className: string): className is WebsiteType =>
    (websiteTypes as readonly string[]).includes(className)

  // Common portfolio selectors
  const portfolioSelectors = [
    '.portfolio-item',
    '.work-item',
    '.project-item',
    '.case-study',
  ]

  for (const selector of portfolioSelectors) {
    const items = $(selector)
    if (items.length === 0) continue

    items.each((_index, element) => {
      const $item = $(element)

      // Extract title/name
      const name =
        $item.find('h2, h3, h4, .portfolio-title, .project-title').first().text().trim() ||
        $item.find('[data-field="name"]').text().trim()

      if (!name) return

      // Extract link
      const link =
        $item.find('a').first().attr('href') ||
        $item.find('[data-field="website-link"]').attr('href') ||
        ''

      // Extract image
      const image =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="preview-image"]').attr('src') ||
        ''

      // Extract description
      const description =
        $item
          .find('.portfolio-description, .project-description, .description')
          .first()
          .text()
          .trim() || ''

      // Extract tags
      const tags = $item.find('.tag, .tags').first().text().trim() || ''

      // Determine website type from the container's class attribute. Cheerio
      // nodes are not browser DOM elements and have no `classList`, so the
      // class names are read from the attribute instead; the first class that
      // names a known type wins, otherwise the type is 'other'.
      const classNames = ($item.attr('class') || '').split(/\s+/)
      const websiteType: WebsiteType = classNames.find(isWebsiteType) || 'other'

      portfolio.push({
        name,
        slug: toSlug(name),
        websiteLink: link,
        previewImage: image,
        description,
        websiteType,
        tags,
      })
    })

    // If we found portfolio items, do not try the remaining selectors
    if (portfolio.length > 0) break
  }

  return portfolio
}
|
||||
|
||||
// ============================================================
|
||||
// URL EXTRACTION
|
||||
// ============================================================
|
||||
|
||||
/**
 * Extract all image URLs from HTML
 *
 * Collects the `src`, `data-src` and every `srcset` candidate URL of each
 * `<img>` element, de-duplicated in first-seen order.
 *
 * Note: `srcset` is split on commas, so a candidate URL that itself contains a
 * comma is not handled.
 *
 * @param html - Raw HTML markup to scan
 * @returns Unique image URLs exactly as written in the markup
 */
export function extractImageUrls(html: string): string[] {
  const $ = cheerioLoad(html)
  const urls = new Set<string>()

  $('img').each((_index, element) => {
    const $img = $(element)
    const src = $img.attr('src')
    const dataSrc = $img.attr('data-src')
    const srcset = $img.attr('srcset')

    if (src) urls.add(src)
    if (dataSrc) urls.add(dataSrc)

    if (srcset) {
      for (const candidate of srcset.split(',')) {
        // The URL and its width/density descriptor may be separated by any
        // whitespace (space, tab, newline), not only a single space.
        const url = candidate.trim().split(/\s+/)[0]
        if (url) urls.add(url)
      }
    }
  })

  return Array.from(urls)
}
|
||||
|
||||
/**
 * Collect the media URLs referenced by already-parsed export data.
 *
 * Gathers each post's `featuredImage` followed by each portfolio item's
 * `previewImage`, skipping empty values and dropping duplicates while keeping
 * first-seen order.
 *
 * @param data - Parsed export data; `posts` and `portfolio` may be absent
 * @returns Unique, non-empty media URLs
 */
export function extractMediaUrls(data: WebflowExportData): string[] {
  // Posts come first so their images keep priority in the output order
  const featuredImages = (data.posts || []).map((post) => post.featuredImage)
  const previewImages = (data.portfolio || []).map((item) => item.previewImage)

  const present = [...featuredImages, ...previewImages].filter(
    (url): url is string => Boolean(url),
  )

  return Array.from(new Set(present))
}
|
||||
Reference in New Issue
Block a user