187 lines
6.1 KiB
TypeScript
187 lines
6.1 KiB
TypeScript
import JSZip from "jszip";
|
|
import type { BookMetadata, EpubParseResult } from "../types";
|
|
|
|
/** Named character references decoded during text extraction; unknown names are left verbatim. */
const EPUB_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/** The attributes of a manifest `<item>` element that the parser uses. */
interface EpubManifestItem {
  id: string;
  href: string;
  mediaType: string;
  properties: string[];
}

/**
 * Decodes named, decimal and hexadecimal character references in one pass.
 * A single pass matters: the output of one replacement is never re-scanned,
 * so an escaped ampersand followed by "lt;" stays a literal entity instead of
 * being decoded twice.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    (whole: string, ref: string) => {
      if (!ref.startsWith("#")) {
        return EPUB_NAMED_ENTITIES.get(ref) ?? whole;
      }
      const isHex = ref.charAt(1).toLowerCase() === "x";
      const codePoint = isHex
        ? Number.parseInt(ref.slice(2), 16)
        : Number.parseInt(ref.slice(1), 10);
      // Reject NUL, lone surrogates and out-of-range values; fromCodePoint
      // would throw a RangeError above U+10FFFF.
      const isValid =
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : whole;
    },
  );
}

/** Reads the attributes of one XML tag into a map keyed by lower-cased name, in any order and with either quote style. */
function parseXmlAttributes(tag: string): Map<string, string> {
  const attributes = new Map<string, string>();
  for (const match of tag.matchAll(/([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
    const name = match[1];
    if (name === undefined) continue;
    attributes.set(name.toLowerCase(), decodeXmlEntities(match[2] ?? match[3] ?? ""));
  }
  return attributes;
}

/** Joins an OPF-relative href onto the OPF directory, dropping any fragment and collapsing "." and ".." segments. */
function resolveZipPath(baseDir: string, href: string): string {
  const withoutFragment = href.split("#", 1)[0] ?? "";
  const joined = withoutFragment.startsWith("/")
    ? withoutFragment.slice(1)
    : baseDir + withoutFragment;
  const segments: string[] = [];
  for (const segment of joined.split("/")) {
    if (segment === "" || segment === ".") continue;
    if (segment === "..") segments.pop();
    else segments.push(segment);
  }
  return segments.join("/");
}

/** Percent-decodes a path, returning it unchanged when it contains malformed escapes. */
function decodeUriSafely(path: string): string {
  try {
    return decodeURIComponent(path);
  } catch {
    return path;
  }
}

/** Picks the MIME type for a cover: the manifest's declared image type, else a guess from the file extension. */
function coverMimeType(item: EpubManifestItem): string {
  if (item.mediaType.startsWith("image/")) return item.mediaType;
  const extension = item.href
    .match(/\.(jpe?g|png|gif|webp|svg)(?:#.*)?$/i)?.[1]
    ?.toLowerCase();
  if (extension === undefined || extension === "jpg") return "image/jpeg";
  if (extension === "svg") return "image/svg+xml";
  return `image/${extension}`;
}

/** Reduces an (X)HTML document to whitespace-normalised plain text. */
function extractPlainText(markup: string): string {
  const withoutTags = markup
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ");
  // Entities are decoded only after tags are stripped so that an escaped
  // angle bracket in the text is never mistaken for markup.
  return decodeXmlEntities(withoutTags).replace(/\s+/g, " ").trim();
}

/**
 * Parses an EPUB archive and extracts its plain text and basic metadata.
 *
 * The OPF package document is located through `META-INF/container.xml`.
 * Title and author come from the Dublin Core elements. The cover is looked
 * up, in order, via a manifest item with the `cover-image` property, the id
 * named by `<meta name="cover">`, and finally an image item whose id contains
 * "cover"; when found it is embedded as a base64 data URL. Text is gathered
 * from the spine documents in reading order; if the spine yields nothing,
 * every `.xhtml`/`.html`/`.htm` entry in the archive is used instead.
 *
 * @param file - The EPUB file to read.
 * @returns The concatenated text (chapters separated by a single space) and
 *   the extracted metadata.
 * @throws Error if `container.xml` is missing, it names no rootfile, or the
 *   OPF document cannot be read.
 */
export async function parseEpub(file: File): Promise<EpubParseResult> {
  const zip = await JSZip.loadAsync(file);

  // container.xml points at the OPF package document.
  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");

  let opfPath: string | undefined;
  for (const tag of containerXml.matchAll(/<(?:[\w.-]+:)?rootfile\b[^>]*>/gi)) {
    opfPath = parseXmlAttributes(tag[0]).get("full-path");
    if (opfPath) break;
  }
  if (!opfPath) throw new Error("Invalid EPUB: cannot find rootfile");

  // Manifest hrefs are relative to the directory that holds the OPF.
  const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);

  const opfContent = await zip.file(opfPath)?.async("text");
  if (!opfContent) throw new Error("Invalid EPUB: cannot read OPF");

  const cleanText = (raw: string | undefined): string | null => {
    const text = decodeXmlEntities(raw ?? "").trim();
    return text === "" ? null : text;
  };
  const metadata: BookMetadata = {
    title: cleanText(opfContent.match(/<dc:title[^>]*>([^<]+)<\/dc:title>/i)?.[1]),
    author: cleanText(
      opfContent.match(/<dc:creator[^>]*>([^<]+)<\/dc:creator>/i)?.[1],
    ),
    cover: null,
  };

  // Read every manifest item once; attribute order no longer matters.
  const manifestItems: EpubManifestItem[] = [];
  const manifestById = new Map<string, EpubManifestItem>();
  for (const tag of opfContent.matchAll(/<(?:[\w.-]+:)?item\b[^>]*>/gi)) {
    const attrs = parseXmlAttributes(tag[0]);
    const href = attrs.get("href");
    if (!href) continue;
    const item: EpubManifestItem = {
      id: attrs.get("id") ?? "",
      href,
      mediaType: (attrs.get("media-type") ?? "").toLowerCase(),
      properties: (attrs.get("properties") ?? "").split(/\s+/).filter(Boolean),
    };
    manifestItems.push(item);
    if (item.id !== "") manifestById.set(item.id, item);
  }

  // Looks an href up in the archive, trying the literal path first and the
  // percent-decoded path second.
  const openEntry = (href: string) => {
    const path = resolveZipPath(opfDir, href);
    return zip.file(path) ?? zip.file(decodeUriSafely(path));
  };

  // Cover lookup. The id comparison is a plain string match, so ids that
  // contain regex metacharacters are handled safely.
  const metaCoverId = [...opfContent.matchAll(/<(?:[\w.-]+:)?meta\b[^>]*>/gi)]
    .map((tag) => parseXmlAttributes(tag[0]))
    .find((attrs) => attrs.get("name")?.toLowerCase() === "cover")
    ?.get("content")
    ?.toLowerCase();
  const coverItem =
    manifestItems.find((item) => item.properties.includes("cover-image")) ??
    (metaCoverId === undefined
      ? undefined
      : manifestItems.find((item) => item.id.toLowerCase() === metaCoverId)) ??
    manifestItems.find(
      (item) =>
        item.id.toLowerCase().includes("cover") &&
        item.mediaType.startsWith("image/"),
    );

  // Embed the cover as a base64 data URL so it can be persisted with the book.
  if (coverItem) {
    const coverFile = openEntry(coverItem.href);
    if (coverFile) {
      const coverBase64 = await coverFile.async("base64");
      metadata.cover = `data:${coverMimeType(coverItem)};base64,${coverBase64}`;
    }
  }

  // Spine order is reading order; keep only (X)HTML documents.
  const spineHrefs: string[] = [];
  for (const tag of opfContent.matchAll(/<(?:[\w.-]+:)?itemref\b[^>]*>/gi)) {
    const idref = parseXmlAttributes(tag[0]).get("idref");
    const item = idref === undefined ? undefined : manifestById.get(idref);
    if (
      item &&
      (item.mediaType === "application/xhtml+xml" || item.mediaType === "text/html")
    ) {
      spineHrefs.push(item.href);
    }
  }

  // Fallback names are already full archive paths, so they must not be
  // resolved against the OPF directory again.
  const contentEntries =
    spineHrefs.length > 0
      ? spineHrefs.map((href) => openEntry(href))
      : Object.keys(zip.files)
          .filter((name) => /\.(?:xhtml|html?)$/i.test(name))
          .map((name) => zip.file(name));

  // Promise.all preserves input order, so chapters stay in reading order.
  const chapterTexts = await Promise.all(
    contentEntries.map(async (entry) =>
      entry ? extractPlainText(await entry.async("text")) : "",
    ),
  );

  return {
    text: chapterTexts.filter((chapter) => chapter !== "").join(" "),
    metadata,
  };
}
|
|
|
|
/**
 * Looks a book up on the Open Library search API and returns the best match.
 *
 * Title and author are trimmed and combined into one free-text query. Only
 * the first search result is considered. The response is validated at
 * runtime rather than trusted: a field of the wrong type becomes `null`
 * instead of leaking through.
 *
 * @param title - Book title, or `null` if unknown.
 * @param author - Author name, or `null` if unknown.
 * @returns Metadata built from the first search hit (fields missing or
 *   malformed in the response are `null`), or `null` when there is nothing
 *   to search for, the request fails, or no result is returned.
 */
export async function fetchMetadataFromOpenLibrary(
  title: string | null,
  author: string | null,
): Promise<BookMetadata | null> {
  // Blank or whitespace-only inputs carry no search signal; skip the request.
  const terms = [title, author]
    .map((term) => term?.trim() ?? "")
    .filter((term) => term !== "");
  if (terms.length === 0) return null;

  try {
    const url = `https://openlibrary.org/search.json?q=${encodeURIComponent(terms.join(" "))}&limit=1&fields=title,author_name,cover_i`;
    const response = await fetch(url);
    if (!response.ok) return null;

    const data: unknown = await response.json();
    if (typeof data !== "object" || data === null) return null;

    const docs: unknown = (data as { docs?: unknown }).docs;
    if (!Array.isArray(docs)) return null;

    const book: unknown = docs[0];
    if (typeof book !== "object" || book === null) return null;

    const fields = book as Record<string, unknown>;
    const foundTitle = fields["title"];
    const authorNames = fields["author_name"];
    const coverId = fields["cover_i"];
    // Only the first listed author is used.
    const firstAuthor: unknown = Array.isArray(authorNames)
      ? authorNames[0]
      : undefined;

    return {
      title: typeof foundTitle === "string" && foundTitle !== "" ? foundTitle : null,
      author:
        typeof firstAuthor === "string" && firstAuthor !== "" ? firstAuthor : null,
      cover:
        typeof coverId === "number" && Number.isInteger(coverId) && coverId > 0
          ? `https://covers.openlibrary.org/b/id/${coverId}-M.jpg`
          : null,
    };
  } catch (e: unknown) {
    // Best-effort lookup: network and parse failures are logged, not thrown.
    console.error("Failed to fetch from Open Library:", e);
    return null;
  }
}
|
|
|