// read/src/lib/epub.ts

import JSZip from "jszip";
import type { BookMetadata, EpubParseResult } from "../types";

// Parse EPUB file and extract text + metadata.
export async function parseEpub(file: File): Promise<EpubParseResult> {
  const zip = await JSZip.loadAsync(file);

  // Find the container.xml to get the content.opf path.
  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");

  // Parse container.xml to find rootfile path.
  const rootfileMatch = containerXml.match(/rootfile[^>]*full-path="([^"]+)"/);
  if (!rootfileMatch) throw new Error("Invalid EPUB: cannot find rootfile");

  const opfPath = rootfileMatch[1];
  const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);

  // Read the OPF file.
  const opfContent = await zip.file(opfPath)?.async("text");
  if (!opfContent) throw new Error("Invalid EPUB: cannot read OPF");

  // Extract metadata.
  const titleMatch = opfContent.match(/<dc:title[^>]*>([^<]+)<\/dc:title>/i);
  const authorMatch = opfContent.match(
    /<dc:creator[^>]*>([^<]+)<\/dc:creator>/i,
  );

  const metadata: BookMetadata = {
    title: titleMatch ? titleMatch[1].trim() : null,
    author: authorMatch ? authorMatch[1].trim() : null,
    cover: null,
  };

  // Find cover image - try multiple methods.
  // Method 1: Look for meta cover element.
  const metaCoverMatch = opfContent.match(
    /<meta[^>]*name="cover"[^>]*content="([^"]+)"/i,
  );
  // Method 2: Look for item with properties="cover-image".
  const coverImageMatch = opfContent.match(
    /<item[^>]*properties="cover-image"[^>]*href="([^"]+)"/i,
  );
  // Method 3: Look for item with id containing "cover" and image media-type.
  const coverIdMatch = opfContent.match(
    /<item[^>]*id="[^"]*cover[^"]*"[^>]*href="([^"]+)"[^>]*media-type="image\/[^"]+"/i,
  );
  // Method 4: Alternate format for cover-image property.
  const coverImageMatch2 = opfContent.match(
    /<item[^>]*href="([^"]+)"[^>]*properties="cover-image"/i,
  );

  let coverHref: string | null = null;
  if (coverImageMatch) {
    coverHref = coverImageMatch[1];
  } else if (coverImageMatch2) {
    coverHref = coverImageMatch2[1];
  } else if (metaCoverMatch) {
    // Need to find the href for this id.
    const coverId = metaCoverMatch[1];
    const itemMatch = opfContent.match(
      new RegExp(`<item[^>]*id="${coverId}"[^>]*href="([^"]+)"`, "i"),
    );
    if (itemMatch) coverHref = itemMatch[1];
  } else if (coverIdMatch) {
    coverHref = coverIdMatch[1];
  }

  // Load cover image if found (as base64 data URL for persistence).
  if (coverHref) {
    const coverPath = coverHref.startsWith("/")
      ? coverHref.slice(1)
      : opfDir + coverHref;
    const coverFile = zip.file(coverPath);
    if (coverFile) {
      const coverBase64 = await coverFile.async("base64");
      const mimeMatch = coverHref.match(/\.(jpe?g|png|gif|webp)$/i);
      const mimeType = mimeMatch
        ? `image/${mimeMatch[1].toLowerCase().replace("jpg", "jpeg")}`
        : "image/jpeg";
      metadata.cover = `data:${mimeType};base64,${coverBase64}`;
    }
  }

  // Get spine items (reading order).
  const spineMatches = [
    ...opfContent.matchAll(/<itemref[^>]*idref="([^"]+)"/g),
  ];
  const manifestMatches = [
    ...opfContent.matchAll(
      /<item[^>]*id="([^"]+)"[^>]*href="([^"]+)"[^>]*media-type="application\/xhtml\+xml"/g,
    ),
  ];

  // Also try alternate manifest format.
  const manifestMatches2 = [
    ...opfContent.matchAll(
      /<item[^>]*href="([^"]+)"[^>]*id="([^"]+)"[^>]*media-type="application\/xhtml\+xml"/g,
    ),
  ];

  // Build manifest map.
  const manifest: Record<string, string> = {};
  manifestMatches.forEach((m) => {
    manifest[m[1]] = m[2];
  });
  manifestMatches2.forEach((m) => {
    manifest[m[2]] = m[1];
  });

  // Get ordered content files.
  const contentFiles = spineMatches.map((m) => manifest[m[1]]).filter(Boolean);

  // If spine parsing failed, try to get all xhtml/html files.
  if (contentFiles.length === 0) {
    const allFiles = Object.keys(zip.files).filter(
      (f) => f.endsWith(".xhtml") || f.endsWith(".html") || f.endsWith(".htm"),
    );
    contentFiles.push(...allFiles);
  }

  // Extract text from each content file.
  let fullText = "";
  for (const href of contentFiles) {
    const filePath = href.startsWith("/") ? href.slice(1) : opfDir + href;
    const content = await zip.file(filePath)?.async("text");
    if (content) {
      // Strip HTML tags and get text.
      const textContent = content
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
        .replace(/<[^>]+>/g, " ")
        .replace(/&nbsp;/g, " ")
        .replace(/&amp;/g, "&")
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&#(\d+);/g, (_: string, n: string) =>
          String.fromCharCode(Number(n)),
        )
        .replace(/\s+/g, " ")
        .trim();
      if (textContent) {
        fullText += textContent + " ";
      }
    }
  }

  return { text: fullText.trim(), metadata };
}

// Fetch book metadata from Open Library API.
export async function fetchMetadataFromOpenLibrary(
  title: string | null,
  author: string | null,
): Promise<BookMetadata | null> {
  if (!title && !author) return null;

  try {
    const query = [title, author].filter(Boolean).join(" ");
    const url = `https://openlibrary.org/search.json?q=${encodeURIComponent(query)}&limit=1&fields=title,author_name,cover_i`;
    const response = await fetch(url);
    if (!response.ok) return null;

    const data: unknown = await response.json();
    const docs = (data as { docs?: Array<Record<string, unknown>> }).docs;
    if (!docs || docs.length === 0) return null;

    const book = docs[0] as {
      title?: string;
      author_name?: string[];
      cover_i?: number;
    };
    return {
      title: book.title || null,
      author: book.author_name?.[0] || null,
      cover: book.cover_i
        ? `https://covers.openlibrary.org/b/id/${book.cover_i}-M.jpg`
        : null,
    };
  } catch (e) {
    console.error("Failed to fetch from Open Library:", e);
    return null;
  }
}