Refactor App into TS modules; add tsc typecheck

2026-02-07 20:30:56 +01:00
parent 11cfade6dc
commit 684a4dd0b8
22 changed files with 3254 additions and 1664 deletions
@@ -0,0 +1,186 @@
+import JSZip from "jszip";
+import type { BookMetadata, EpubParseResult } from "../types";
+
/** Named character references decoded during text extraction; any other name is left untouched. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/** Manifest media types that are treated as readable (X)HTML content documents. */
const HTML_MEDIA_TYPES: ReadonlySet<string> = new Set([
  "application/xhtml+xml",
  "text/html",
]);

/** File extensions used when the spine yields nothing and the archive is scanned directly. */
const HTML_FILE_PATTERN = /\.(?:xhtml|html?)$/i;

/** One `<item>` element of the OPF manifest, with attribute order already abstracted away. */
interface OpfManifestItem {
  id: string;
  href: string;
  mediaType: string;
  properties: string[];
}

/**
 * Decodes character references in a single pass.
 *
 * A single pass matters: decoding `&amp;` first and `&lt;` afterwards would
 * turn the escaped text `&amp;lt;` into `<` instead of the literal `&lt;`.
 * Decimal and hexadecimal references are decoded with `String.fromCodePoint`
 * so characters outside the BMP (e.g. emoji) survive; out-of-range values and
 * unknown names are returned unchanged.
 */
function decodeEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z]+));/g,
    (whole: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return NAMED_ENTITIES.get(name) ?? whole;
      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);
      // NaN and values above U+10FFFF both fail this comparison.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}

/** Reads the attributes of one start tag into a map keyed by lower-cased name (single or double quoted values). */
function parseTagAttributes(tag: string): Map<string, string> {
  const attrs = new Map<string, string>();
  const pattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (const m of tag.matchAll(pattern)) {
    const name = (m[1] ?? "").toLowerCase();
    // First occurrence wins, mirroring how a duplicate attribute would be ignored.
    if (!attrs.has(name)) attrs.set(name, decodeEntities(m[2] ?? m[3] ?? ""));
  }
  return attrs;
}

/**
 * Returns the attribute maps of every start tag with the given local name,
 * with or without a namespace prefix. `localName` must be a plain identifier
 * (callers only pass literals), so it is not regex-escaped here.
 */
function findTags(xml: string, localName: string): Map<string, string>[] {
  // The lookahead stops `item` from matching `itemref`, `meta` from matching `metadata`, etc.
  const pattern = new RegExp(
    `<(?:[\\w.-]+:)?${localName}(?=[\\s/>])[^>]*>`,
    "gi",
  );
  return [...xml.matchAll(pattern)].map((m) => parseTagAttributes(m[0]));
}

/** Collects every manifest `<item>` that has an `href`, independent of attribute order. */
function parseManifest(opf: string): OpfManifestItem[] {
  const items: OpfManifestItem[] = [];
  for (const attrs of findTags(opf, "item")) {
    const href = attrs.get("href");
    if (!href) continue;
    items.push({
      id: attrs.get("id") ?? "",
      href,
      mediaType: (attrs.get("media-type") ?? "").trim().toLowerCase(),
      properties: (attrs.get("properties") ?? "").split(/\s+/).filter(Boolean),
    });
  }
  return items;
}

/** Returns the trimmed, entity-decoded text of the first `<dc:title>` / `<dc:creator>` element, or null. */
function readDcElement(
  opf: string,
  element: "title" | "creator",
): string | null {
  const match = opf.match(
    new RegExp(`<dc:${element}[^>]*>([^<]+)</dc:${element}>`, "i"),
  );
  if (!match) return null;
  return decodeEntities(match[1] ?? "").trim() || null;
}

/**
 * Picks the manifest item that holds the cover image, trying in order:
 * 1. an item whose `properties` include `cover-image`;
 * 2. the item whose id is named by `<meta name="cover" content="…">`;
 * 3. any image item whose id contains "cover".
 * Each step falls through to the next when it finds nothing.
 */
function findCoverItem(
  opf: string,
  manifest: OpfManifestItem[],
): OpfManifestItem | undefined {
  const byProperty = manifest.find((item) =>
    item.properties.includes("cover-image"),
  );
  if (byProperty) return byProperty;

  const coverId = findTags(opf, "meta")
    .find((attrs) => attrs.get("name")?.toLowerCase() === "cover")
    ?.get("content");
  if (coverId) {
    // Plain string comparison: the id is never interpolated into a RegExp,
    // so ids such as "cover.jpg" or "cover(1)" are matched literally.
    const wanted = coverId.toLowerCase();
    const byMeta =
      manifest.find((item) => item.id === coverId) ??
      manifest.find((item) => item.id.toLowerCase() === wanted);
    if (byMeta) return byMeta;
  }

  return manifest.find(
    (item) => /cover/i.test(item.id) && item.mediaType.startsWith("image/"),
  );
}

/** Chooses the MIME type for the cover data URL: known extension first, then manifest media type, then JPEG. */
function coverMimeType(item: OpfManifestItem): string {
  const ext = item.href.match(/\.(jpe?g|png|gif|webp)$/i)?.[1];
  if (ext) return `image/${ext.toLowerCase().replace("jpg", "jpeg")}`;
  return item.mediaType.startsWith("image/") ? item.mediaType : "image/jpeg";
}

/** Joins an href onto a base directory exactly as written (a leading "/" means archive root). */
function joinZipPath(baseDir: string, href: string): string {
  return href.startsWith("/") ? href.slice(1) : baseDir + href;
}

/**
 * Resolves an OPF href to an archive path: drops the fragment, percent-decodes
 * (keeping the raw text if the escapes are malformed) and collapses `.` / `..`
 * segments.
 */
function resolveZipPath(baseDir: string, href: string): string {
  const bare = href.split("#")[0] ?? "";
  let decoded = bare;
  try {
    decoded = decodeURIComponent(bare);
  } catch {
    // Malformed percent-escape: fall back to the raw href.
  }
  const segments: string[] = [];
  for (const segment of joinZipPath(baseDir, decoded).split("/")) {
    if (segment === "" || segment === ".") continue;
    if (segment === "..") segments.pop();
    else segments.push(segment);
  }
  return segments.join("/");
}

/** Reduces an (X)HTML document to whitespace-normalised plain text. */
function extractText(html: string): string {
  const stripped = html
    .replace(/<!--[\s\S]*?-->/g, "")
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ");
  return decodeEntities(stripped).replace(/\s+/g, " ").trim();
}

/**
 * Parses an EPUB file and extracts its plain text plus basic metadata.
 *
 * The archive is opened with JSZip, `META-INF/container.xml` is used to locate
 * the OPF package document, and the OPF supplies title, author, cover and
 * reading order. Content documents are read in spine order; if the spine
 * yields no (X)HTML documents, every `.xhtml` / `.html` / `.htm` entry in the
 * archive is read instead.
 *
 * @param file - The EPUB file to read.
 * @returns The concatenated text of all content documents (separated by single
 *   spaces) and the metadata; `cover` is a base64 data URL or null.
 * @throws Error with message "Invalid EPUB: missing container.xml",
 *   "Invalid EPUB: cannot find rootfile" or "Invalid EPUB: cannot read OPF"
 *   when the corresponding part of the package cannot be read.
 */
export async function parseEpub(file: File): Promise<EpubParseResult> {
  const zip = await JSZip.loadAsync(file);

  // Try the href exactly as written first (keeps archives with literal "%" in
  // entry names working), then the decoded and normalised form.
  const findEntry = (baseDir: string, href: string) =>
    zip.file(joinZipPath(baseDir, href)) ??
    zip.file(resolveZipPath(baseDir, href));

  // container.xml names the OPF package document.
  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");

  const opfPath = findTags(containerXml, "rootfile")
    .map((attrs) => attrs.get("full-path"))
    .find((path) => Boolean(path));
  if (!opfPath) throw new Error("Invalid EPUB: cannot find rootfile");

  // Manifest hrefs are relative to the directory that holds the OPF.
  const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);

  const opfContent = await zip.file(opfPath)?.async("text");
  if (!opfContent) throw new Error("Invalid EPUB: cannot read OPF");

  const metadata: BookMetadata = {
    title: readDcElement(opfContent, "title"),
    author: readDcElement(opfContent, "creator"),
    cover: null,
  };

  const manifest = parseManifest(opfContent);

  // Embed the cover as a base64 data URL so it can be persisted with the book.
  const coverItem = findCoverItem(opfContent, manifest);
  if (coverItem) {
    const coverFile = findEntry(opfDir, coverItem.href);
    if (coverFile) {
      const coverBase64 = await coverFile.async("base64");
      metadata.cover = `data:${coverMimeType(coverItem)};base64,${coverBase64}`;
    }
  }

  // Map manifest ids to hrefs for (X)HTML items only, then follow the spine.
  const htmlHrefById = new Map<string, string>();
  for (const item of manifest) {
    if (item.id && HTML_MEDIA_TYPES.has(item.mediaType)) {
      htmlHrefById.set(item.id, item.href);
    }
  }
  const spineHrefs: string[] = [];
  for (const attrs of findTags(opfContent, "itemref")) {
    const href = htmlHrefById.get(attrs.get("idref") ?? "");
    if (href) spineHrefs.push(href);
  }

  // Spine hrefs are OPF-relative; fallback names are already full archive
  // paths and must NOT be prefixed with the OPF directory again.
  const entries =
    spineHrefs.length > 0
      ? spineHrefs.map((href) => findEntry(opfDir, href))
      : Object.keys(zip.files)
          .filter((name) => HTML_FILE_PATTERN.test(name))
          .map((name) => zip.file(name));

  const parts: string[] = [];
  for (const entry of entries) {
    const content = await entry?.async("text");
    if (!content) continue;
    const textContent = extractText(content);
    if (textContent) parts.push(textContent);
  }

  return { text: parts.join(" "), metadata };
}
+
/**
 * Looks up book metadata through the Open Library search API.
 *
 * The title and author are joined into one free-text query and only the first
 * search hit is used. This is a best-effort lookup: any network error, non-OK
 * response or unexpected response shape yields `null` instead of throwing.
 *
 * @param title - Book title, or null when unknown.
 * @param author - Author name, or null when unknown.
 * @returns Metadata built from the first hit (fields that are missing or have
 *   an unexpected type are null), or null when there is nothing to search for
 *   or no usable hit.
 */
export async function fetchMetadataFromOpenLibrary(
  title: string | null,
  author: string | null,
): Promise<BookMetadata | null> {
  // Whitespace-only inputs carry no information; skip the request entirely.
  const terms = [title, author]
    .map((term) => term?.trim() ?? "")
    .filter((term) => term.length > 0);
  if (terms.length === 0) return null;

  try {
    const query = terms.join(" ");
    const url = `https://openlibrary.org/search.json?q=${encodeURIComponent(query)}&limit=1&fields=title,author_name,cover_i`;
    const response = await fetch(url);
    if (!response.ok) return null;

    // The payload is external input: narrow it at runtime rather than
    // asserting a shape, so malformed values never leak into BookMetadata.
    const data: unknown = await response.json();
    if (typeof data !== "object" || data === null) return null;
    const docs: unknown = (data as { docs?: unknown }).docs;
    if (!Array.isArray(docs) || docs.length === 0) return null;

    const first: unknown = docs[0];
    if (typeof first !== "object" || first === null) return null;
    const book = first as Record<string, unknown>;

    const bookTitle = book.title;
    const firstAuthor: unknown = Array.isArray(book.author_name)
      ? book.author_name[0]
      : undefined;
    const coverId = book.cover_i;

    return {
      title: typeof bookTitle === "string" && bookTitle ? bookTitle : null,
      author: typeof firstAuthor === "string" && firstAuthor ? firstAuthor : null,
      // Only a positive integer id is turned into a cover URL.
      cover:
        typeof coverId === "number" && Number.isInteger(coverId) && coverId > 0
          ? `https://covers.openlibrary.org/b/id/${coverId}-M.jpg`
          : null,
    };
  } catch (e) {
    console.error("Failed to fetch from Open Library:", e);
    return null;
  }
}
+