import JSZip from "jszip";
import type { BookMetadata, EpubParseResult } from "../types";

/** Escapes regex metacharacters so `value` can be embedded literally in a RegExp. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Resolves an href taken from the OPF manifest to a path inside the archive.
 * A leading "/" is treated as archive-root relative; anything else is taken
 * relative to the directory that contains the OPF file.
 */
function resolveZipPath(opfDir: string, href: string): string {
  return href.startsWith("/") ? href.slice(1) : opfDir + href;
}

/**
 * Locates the cover image href in the OPF document, trying several heuristics
 * in priority order. Returns null when none of them yields an href.
 */
function findCoverHref(opfContent: string): string | null {
  // Method 1: <item> with properties="cover-image" (either attribute order).
  const byProperty =
    opfContent.match(
      /<item[^>]*properties="cover-image"[^>]*href="([^"]+)"/i,
    )?.[1] ??
    opfContent.match(
      /<item[^>]*href="([^"]+)"[^>]*properties="cover-image"/i,
    )?.[1];
  if (byProperty) return byProperty;

  // Method 2: <meta name="cover" content="ID"> pointing at a manifest item id.
  const coverId = opfContent.match(
    /<meta[^>]*name="cover"[^>]*content="([^"]+)"/i,
  )?.[1];
  if (coverId) {
    // The id comes from the file itself, so escape it before building a regex.
    const byMeta = opfContent.match(
      new RegExp(
        `<item[^>]*id="${escapeRegExp(coverId)}"[^>]*href="([^"]+)"`,
        "i",
      ),
    )?.[1];
    if (byMeta) return byMeta;
    // Deliberately fall through: a dangling meta reference should not prevent
    // the id-based heuristic below from being tried.
  }

  // Method 3: <item> whose id contains "cover" and whose media-type is an image.
  const byId = opfContent.match(
    /<item[^>]*id="[^"]*cover[^"]*"[^>]*href="([^"]+)"[^>]*media-type="image\/[^"]+"/i,
  )?.[1];
  return byId ?? null;
}

/**
 * Converts a numeric character reference to its character. Returns the
 * original entity text unchanged when the value is not a valid code point.
 */
function decodeNumericEntity(entity: string, codePoint: number): string {
  return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
    ? String.fromCodePoint(codePoint)
    : entity;
}

/** Strips markup from an (X)HTML document and returns whitespace-normalized text. */
function htmlToPlainText(html: string): string {
  return (
    html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
      .replace(/<[^>]+>/g, " ")
      .replace(/&nbsp;/g, " ")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&#x([0-9a-f]+);/gi, (entity: string, hex: string) =>
        decodeNumericEntity(entity, parseInt(hex, 16)),
      )
      .replace(/&#(\d+);/g, (entity: string, dec: string) =>
        decodeNumericEntity(entity, Number(dec)),
      )
      // "&amp;" is decoded last so that text such as "&amp;lt;" ends up as the
      // literal "&lt;" instead of being decoded a second time into "<".
      .replace(/&amp;/g, "&")
      .replace(/\s+/g, " ")
      .trim()
  );
}

/**
 * Parses an EPUB file and extracts its plain text plus basic metadata.
 *
 * The OPF package document is located through META-INF/container.xml and
 * inspected with regular expressions (no XML parser is used). Title, author
 * and cover are read from it; the cover, when found, is embedded as a base64
 * data URL. Text is concatenated from the XHTML documents in spine order.
 *
 * @param file - The EPUB archive to read.
 * @returns The concatenated text of all content documents and the metadata.
 * @throws Error when container.xml is missing, when it declares no rootfile,
 *   or when the OPF file cannot be read. Errors raised by JSZip while loading
 *   the archive are not caught here.
 */
export async function parseEpub(file: File): Promise<EpubParseResult> {
  const zip = await JSZip.loadAsync(file);

  // Find the container.xml to get the content.opf path.
  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");

  // Parse container.xml to find rootfile path.
  const opfPath = containerXml.match(/rootfile[^>]*full-path="([^"]+)"/)?.[1];
  if (!opfPath) throw new Error("Invalid EPUB: cannot find rootfile");
  const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);

  // Read the OPF file.
  const opfContent = await zip.file(opfPath)?.async("text");
  if (!opfContent) throw new Error("Invalid EPUB: cannot read OPF");

  // Extract metadata.
  const title = opfContent.match(/<dc:title[^>]*>([^<]+)<\/dc:title>/i)?.[1];
  const author = opfContent.match(
    /<dc:creator[^>]*>([^<]+)<\/dc:creator>/i,
  )?.[1];
  const metadata: BookMetadata = {
    title: title?.trim() ?? null,
    author: author?.trim() ?? null,
    cover: null,
  };

  // Load cover image if found (as base64 data URL for persistence).
  const coverHref = findCoverHref(opfContent);
  if (coverHref) {
    const coverFile = zip.file(resolveZipPath(opfDir, coverHref));
    if (coverFile) {
      const coverBase64 = await coverFile.async("base64");
      const extension = coverHref
        .match(/\.(jpe?g|png|gif|webp)$/i)?.[1]
        ?.toLowerCase();
      // Unknown or missing extensions are assumed to be JPEG.
      const mimeType = extension
        ? `image/${extension.replace("jpg", "jpeg")}`
        : "image/jpeg";
      metadata.cover = `data:${mimeType};base64,${coverBase64}`;
    }
  }

  // Build manifest map (id -> href) for XHTML items, accepting both the
  // id-before-href and href-before-id attribute orders.
  const manifest = new Map<string, string>();
  for (const [, id, href] of opfContent.matchAll(
    /<item[^>]*id="([^"]+)"[^>]*href="([^"]+)"[^>]*media-type="application\/xhtml\+xml"/g,
  )) {
    if (id !== undefined && href !== undefined) manifest.set(id, href);
  }
  for (const [, href, id] of opfContent.matchAll(
    /<item[^>]*href="([^"]+)"[^>]*id="([^"]+)"[^>]*media-type="application\/xhtml\+xml"/g,
  )) {
    if (id !== undefined && href !== undefined) manifest.set(id, href);
  }

  // Get ordered content files from the spine (reading order), already
  // resolved to archive paths.
  let contentPaths: string[] = [];
  for (const [, idref] of opfContent.matchAll(
    /<itemref[^>]*idref="([^"]+)"/g,
  )) {
    const href = idref === undefined ? undefined : manifest.get(idref);
    if (href) contentPaths.push(resolveZipPath(opfDir, href));
  }

  // If spine parsing failed, try to get all xhtml/html files. The keys of
  // zip.files are already full archive paths, so they must NOT be prefixed
  // with opfDir again.
  if (contentPaths.length === 0) {
    contentPaths = Object.keys(zip.files).filter(
      (f) => f.endsWith(".xhtml") || f.endsWith(".html") || f.endsWith(".htm"),
    );
  }

  // Extract text from each content file, preserving reading order.
  const textParts: string[] = [];
  for (const filePath of contentPaths) {
    const content = await zip.file(filePath)?.async("text");
    if (!content) continue;
    const textContent = htmlToPlainText(content);
    if (textContent) textParts.push(textContent);
  }

  return { text: textParts.join(" ").trim(), metadata };
}

/**
 * Fetches book metadata from the Open Library search API.
 *
 * Title and author are joined into one free-text query and only the first
 * search result is used. This is best-effort: any failure (network error,
 * non-OK response, unexpected payload) results in null rather than a throw.
 *
 * @param title - Book title, or null if unknown.
 * @param author - Book author, or null if unknown.
 * @returns Metadata built from the first result, or null when both inputs are
 *   empty, nothing was found, or the request failed.
 */
export async function fetchMetadataFromOpenLibrary(
  title: string | null,
  author: string | null,
): Promise<BookMetadata | null> {
  if (!title && !author) return null;

  try {
    const query = [title, author].filter(Boolean).join(" ");
    const url = `https://openlibrary.org/search.json?q=${encodeURIComponent(query)}&limit=1&fields=title,author_name,cover_i`;

    const response = await fetch(url);
    if (!response.ok) return null;

    // The payload is external input: narrow it instead of trusting a cast.
    const data: unknown = await response.json();
    const docs = (data as { docs?: unknown } | null)?.docs;
    if (!Array.isArray(docs) || docs.length === 0) return null;

    const book = docs[0] as {
      title?: unknown;
      author_name?: unknown;
      cover_i?: unknown;
    };
    const firstAuthor: unknown = Array.isArray(book.author_name)
      ? book.author_name[0]
      : undefined;

    return {
      title: typeof book.title === "string" && book.title ? book.title : null,
      author: typeof firstAuthor === "string" && firstAuthor ? firstAuthor : null,
      cover:
        typeof book.cover_i === "number" && book.cover_i
          ? `https://covers.openlibrary.org/b/id/${book.cover_i}-M.jpg`
          : null,
    };
  } catch (e: unknown) {
    console.error("Failed to fetch from Open Library:", e);
    return null;
  }
}