/**
 * HTML Parser Module
 * Story 1.3: Content Migration Script
 *
 * Parses HTML files from Webflow to extract structured data
 * Used when JSON export is not available
 */

import type { WebflowExportData } from './types'
import { toSlug, cleanHTML, htmlToPlainText } from './utils'
import { load as cheerioLoad, CheerioAPI } from 'cheerio'

// ============================================================
// MAIN PARSER FUNCTION
// ============================================================

/**
 * Turn a Webflow HTML document into structured export data.
 *
 * The markup is loaded into cheerio once and handed to each extractor in
 * turn (posts, then categories, then portfolio).
 *
 * @param html - Raw HTML markup to parse.
 * @param sourceUrl - Accepted for API compatibility; not read by this function.
 * @returns The posts, categories and portfolio items found in the markup.
 */
export function parseWebflowHTML(html: string, sourceUrl?: string): WebflowExportData {
  const dom = cheerioLoad(html)

  const posts = extractPosts(dom)
  const categories = extractCategories(dom)
  const portfolio = extractPortfolio(dom)

  return { posts, categories, portfolio }
}

/**
 * Read an HTML file from disk as UTF-8 and parse it with `parseWebflowHTML`.
 *
 * `fs/promises` is imported lazily, so the module is only loaded when this
 * function is actually called.
 *
 * @param filePath - Path of the HTML file to read.
 * @returns The structured data extracted from the file's markup.
 */
export async function parseHTMLFile(filePath: string): Promise<WebflowExportData> {
  const fsPromises = await import('fs/promises')
  const markup = await fsPromises.readFile(filePath, 'utf-8')
  return parseWebflowHTML(markup)
}

// ============================================================
// POST EXTRACTION
// ============================================================

/** Shape of a single blog post recovered from the HTML. */
type ExtractedPost = {
  title: string
  slug: string
  content: string
  publishedDate: string
  postCategory?: string
  featuredImage?: string
  seoTitle?: string
  seoDescription?: string
}

/**
 * Extract blog posts from HTML
 * This is a generic extractor - customize based on actual Webflow HTML structure
 *
 * Candidate selectors are tried in order; the first selector that yields at
 * least one post wins and later selectors are not consulted. Items without a
 * title are skipped.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns One entry per post item found; empty when nothing matched.
 */
function extractPosts($: CheerioAPI): ExtractedPost[] {
  const posts: ExtractedPost[] = []

  // Common Webflow blog post selectors
  const postSelectors = [
    '.w-dyn-item', // Webflow collection item
    '.blog-post',
    '.post-item',
    'article',
  ]

  for (const selector of postSelectors) {
    $(selector).each((_index, element) => {
      const $item = $(element)

      // Title: first heading / title-classed element, else the data-field hook
      const title =
        $item.find('h1, h2, h3, .post-title, .blog-title').first().text().trim() ||
        $item.find('[data-field="title"]').text().trim()

      // A post without a title cannot be identified; skip it
      if (!title) return

      // Slug: last path segment of the first link. The query string and
      // fragment are dropped first so "/blog/my-post?ref=home#top" yields
      // "my-post" and a bare "#" link yields nothing.
      const href = $item.find('a').first().attr('href') || ''
      const hrefPath = href.split(/[?#]/)[0] ?? ''
      const slugFromLink = hrefPath.split('/').filter(Boolean).pop()

      // Content: inner HTML of the first content container, run through cleanHTML
      const contentEl = $item.find('.post-content, .blog-content, .content').first()
      const content = contentEl.length ? cleanHTML(contentEl.html() || '') : ''

      // Date: prefer the machine-readable <time datetime="..."> value over the
      // human-formatted text, then fall back to the visible text, and finally
      // to the current time when the item carries no date at all.
      const publishedDate =
        ($item.find('time').first().attr('datetime') || '').trim() ||
        $item.find('.post-date, .blog-date, .date, time').first().text().trim() ||
        new Date().toISOString()

      // Category label, if any
      const category =
        $item.find('.post-category, .blog-category, .category').first().text().trim() ||
        $item.find('[data-field="category"]').text().trim()

      // Featured image: first <img>, else the data-field hook
      const featuredImage =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="featured-image"]').attr('src') ||
        undefined

      // SEO meta found inside the item
      const seoTitle =
        $item.find('meta[property="og:title"]').attr('content') ||
        $item.find('[data-field="seo-title"]').attr('content') ||
        undefined

      const seoDescription =
        $item.find('meta[property="og:description"]').attr('content') ||
        $item.find('[data-field="seo-description"]').attr('content') ||
        undefined

      posts.push({
        title,
        // Fall back to a slug derived from the title when the link gave none
        slug: slugFromLink || toSlug(title),
        content,
        publishedDate,
        postCategory: category || undefined,
        featuredImage,
        seoTitle,
        seoDescription,
      })
    })

    // Stop at the first selector that produced posts
    if (posts.length > 0) {
      break
    }
  }

  return posts
}

// ============================================================
// CATEGORY EXTRACTION
// ============================================================

/** Shape of a single category recovered from the HTML. */
type ExtractedCategory = {
  name: string
  slug: string
  colorHex?: string
}

/**
 * Matches a hex value on the `color` property of an inline style.
 * The property must start the declaration (string start or after `;`), so
 * `background-color` / `border-color` are not mistaken for the text colour,
 * and the `#` is required so named colours such as `deeppink` are not read
 * as hex digits.
 */
const INLINE_TEXT_COLOR_HEX = /(?:^|;)\s*color\s*:\s*#([a-f0-9]{6}|[a-f0-9]{3})/i

/**
 * Extract categories from HTML
 *
 * Every candidate selector is scanned and the results are combined, with
 * categories de-duplicated by display name (first occurrence wins). When the
 * markup yields no category at all, a fixed list of known categories with a
 * default colour is returned instead.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns The categories found, or the known-category fallback list.
 */
function extractCategories($: CheerioAPI): ExtractedCategory[] {
  const categories: ExtractedCategory[] = []

  // Common category selectors
  const categorySelectors = [
    '.category-link',
    '.post-category',
    '.filter-category',
    '[data-field="category"]',
  ]

  const seenNames = new Set<string>()

  for (const selector of categorySelectors) {
    $(selector).each((_index, element) => {
      const $item = $(element)
      const name = $item.text().trim() || $item.attr('data-category') || ''

      // Skip unnamed elements and names that were already collected
      if (!name || seenNames.has(name)) return
      seenNames.add(name)

      // Try to extract the text colour from the inline style attribute
      const colorMatch = ($item.attr('style') || '').match(INLINE_TEXT_COLOR_HEX)
      const colorHex = colorMatch ? `#${colorMatch[1]}` : undefined

      categories.push({
        name,
        slug: toSlug(name),
        colorHex,
      })
    })
  }

  // Known categories from the story
  const knownCategories = [
    { name: 'Google小學堂', slug: 'google-workshop' },
    { name: 'Meta小學堂', slug: 'meta-workshop' },
    { name: '行銷時事最前線', slug: 'marketing-news' },
    { name: '恩群數位最新公告', slug: 'enchun-announcements' },
  ]

  // Fall back to the known categories only when the markup yielded none
  if (categories.length === 0) {
    return knownCategories.map((cat) => ({
      ...cat,
      colorHex: '#0066cc', // Default blue color
    }))
  }

  return categories
}

// ============================================================
// PORTFOLIO EXTRACTION
// ============================================================

/** Website types recognised when they appear as a CSS class on a portfolio item. */
const PORTFOLIO_WEBSITE_TYPES = ['corporate', 'ecommerce', 'landing', 'brand', 'other'] as const

type PortfolioWebsiteType = (typeof PORTFOLIO_WEBSITE_TYPES)[number]

/** Shape of a single portfolio entry recovered from the HTML. */
type ExtractedPortfolioItem = {
  name: string
  slug: string
  websiteLink: string
  previewImage: string
  description: string
  websiteType: PortfolioWebsiteType
  tags: string
}

/** Narrow an arbitrary class name to one of the recognised website types. */
function isPortfolioWebsiteType(value: string): value is PortfolioWebsiteType {
  return (PORTFOLIO_WEBSITE_TYPES as readonly string[]).includes(value)
}

/**
 * Extract portfolio items from HTML
 *
 * Candidate selectors are tried in order; the first selector that yields at
 * least one item wins and later selectors are not consulted. Items without a
 * name are skipped.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns One entry per portfolio item found; empty when nothing matched.
 */
function extractPortfolio($: CheerioAPI): ExtractedPortfolioItem[] {
  const portfolio: ExtractedPortfolioItem[] = []

  // Common portfolio selectors
  const portfolioSelectors = [
    '.portfolio-item',
    '.work-item',
    '.project-item',
    '.case-study',
  ]

  for (const selector of portfolioSelectors) {
    $(selector).each((_index, element) => {
      const $item = $(element)

      // Name: first heading / title-classed element, else the data-field hook
      const name =
        $item.find('h2, h3, h4, .portfolio-title, .project-title').first().text().trim() ||
        $item.find('[data-field="name"]').text().trim()

      // An item without a name cannot be identified; skip it
      if (!name) return

      // Link: first anchor, else the data-field hook
      const websiteLink =
        $item.find('a').first().attr('href') ||
        $item.find('[data-field="website-link"]').attr('href') ||
        ''

      // Preview image: first <img>, else the data-field hook
      const previewImage =
        $item.find('img').first().attr('src') ||
        $item.find('[data-field="preview-image"]').attr('src') ||
        ''

      const description =
        $item.find('.portfolio-description, .project-description, .description')
          .first()
          .text()
          .trim() || ''

      const tags = $item.find('.tag, .tags').first().text().trim() || ''

      // Website type: first class on the item that names a known type.
      // Cheerio nodes are not browser DOM elements and have no `classList`,
      // so the class attribute is read and split manually.
      const classNames = ($item.attr('class') ?? '').split(/\s+/)
      const websiteType = classNames.find(isPortfolioWebsiteType) ?? 'other'

      portfolio.push({
        name,
        slug: toSlug(name),
        websiteLink,
        previewImage,
        description,
        websiteType,
        tags,
      })
    })

    // Stop at the first selector that produced items
    if (portfolio.length > 0) {
      break
    }
  }

  return portfolio
}

// ============================================================
// URL EXTRACTION
// ============================================================

/**
 * Extract all image URLs from HTML
 *
 * Collects the `src`, `data-src` and every `srcset` candidate URL of each
 * `<img>` element. Duplicates are removed; order follows first appearance.
 *
 * Note: `srcset` candidates are split on commas, so a candidate URL that
 * itself contains a comma will be broken into pieces.
 *
 * @param html - Raw HTML markup to scan.
 * @returns The distinct image URLs found.
 */
export function extractImageUrls(html: string): string[] {
  const $ = cheerioLoad(html)
  const urls = new Set<string>()

  $('img').each((_index, element) => {
    const $img = $(element)
    const src = $img.attr('src')
    const dataSrc = $img.attr('data-src')
    const srcset = $img.attr('srcset')

    if (src) urls.add(src)
    if (dataSrc) urls.add(dataSrc)
    if (srcset) {
      for (const candidate of srcset.split(',')) {
        // A candidate is "<url> <descriptor>"; the separator may be any run
        // of whitespace (spaces, tabs, newlines), not just a single space.
        const url = candidate.trim().split(/\s+/)[0]
        if (url) urls.add(url)
      }
    }
  })

  return Array.from(urls)
}

/**
 * Collect the distinct media URLs referenced by already-parsed data.
 *
 * Gathers each post's `featuredImage` followed by each portfolio item's
 * `previewImage`, ignoring empty values and duplicates. A missing `posts` or
 * `portfolio` collection is treated as empty.
 *
 * @param data - Parsed export data to scan.
 * @returns Unique URLs in first-seen order (posts before portfolio).
 */
export function extractMediaUrls(data: WebflowExportData): string[] {
  const featuredImages = (data.posts || []).map((post) => post.featuredImage)
  const previewImages = (data.portfolio || []).map((item) => item.previewImage)

  const unique = new Set<string>()
  for (const candidate of [...featuredImages, ...previewImages]) {
    if (candidate) unique.add(candidate)
  }

  return [...unique]
}