Refactor App into TS modules; add tsc typecheck
This commit is contained in:
+186
@@ -0,0 +1,186 @@
|
||||
import JSZip from "jszip";
|
||||
import type { BookMetadata, EpubParseResult } from "../types";
|
||||
|
||||
/** Named entities decoded in OPF/XHTML text; unknown names are left untouched. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/**
 * Decodes named, decimal and hexadecimal character references in one pass.
 *
 * A single pass matters: decoding `&amp;` in a separate earlier step would
 * turn `&amp;lt;` into `<` instead of the intended literal `&lt;`.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    (whole: string, body: string) => {
      if (!body.startsWith("#")) return NAMED_ENTITIES.get(body) ?? whole;
      const isHex = body.charAt(1).toLowerCase() === "x";
      const codePoint = isHex
        ? parseInt(body.slice(2), 16)
        : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}

/** Reduces an (X)HTML document to whitespace-normalized plain text. */
function extractText(markup: string): string {
  const withoutMarkup = markup
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ");
  return decodeXmlEntities(withoutMarkup).replace(/\s+/g, " ").trim();
}

/**
 * Collects the attributes of every `<tagName …>` start tag in `xml`, in
 * document order. Attribute names are lower-cased, values are entity-decoded,
 * both quote styles and an optional namespace prefix (`opf:item`) are accepted.
 *
 * `tagName` is interpolated into a RegExp unescaped, so pass literal tag names only.
 */
function parseStartTags(
  xml: string,
  tagName: string,
): Array<Map<string, string>> {
  const tagPattern = new RegExp(
    `<(?:[\\w.-]+:)?${tagName}(?=[\\s/>])([^>]*)>`,
    "gi",
  );
  const attributePattern = /([^\s=\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  const tags: Array<Map<string, string>> = [];
  for (const tagMatch of xml.matchAll(tagPattern)) {
    const attributes = new Map<string, string>();
    for (const attr of (tagMatch[1] ?? "").matchAll(attributePattern)) {
      const name = (attr[1] ?? "").toLowerCase();
      const value = attr[2] ?? attr[3] ?? "";
      if (name !== "") attributes.set(name, decodeXmlEntities(value));
    }
    tags.push(attributes);
  }
  return tags;
}

/** Returns the trimmed, entity-decoded text of the first `<dc:element>`, or null. */
function readDcElement(
  opf: string,
  element: "title" | "creator",
): string | null {
  const pattern = new RegExp(
    `<dc:${element}[^>]*>([^<]+)</dc:${element}>`,
    "i",
  );
  const raw = opf.match(pattern)?.[1];
  const value = raw === undefined ? "" : decodeXmlEntities(raw).trim();
  return value === "" ? null : value;
}

/**
 * Lists the archive paths an OPF-relative `href` may refer to, most literal first.
 *
 * The first candidate is the plain concatenation (a leading "/" means
 * archive-root). The second, when different, drops any `#fragment`,
 * percent-decodes the href and collapses `.` / `..` segments, so hrefs such as
 * `../Images/my%20cover.png` resolve as well.
 */
function candidateZipPaths(baseDir: string, href: string): string[] {
  const join = (ref: string): string =>
    ref.startsWith("/") ? ref.slice(1) : baseDir + ref;

  const literal = join(href);

  let decoded = href.replace(/#.*$/, "");
  try {
    decoded = decodeURIComponent(decoded);
  } catch {
    // Malformed percent-escape: fall back to the undecoded href.
  }
  const segments: string[] = [];
  for (const segment of join(decoded).split("/")) {
    if (segment === "" || segment === ".") continue;
    if (segment === "..") segments.pop();
    else segments.push(segment);
  }
  const normalized = segments.join("/");

  return normalized === literal ? [literal] : [literal, normalized];
}

/** Picks a MIME type for the cover: declared media-type first, then file extension. */
function imageMimeType(mediaType: string | undefined, href: string): string {
  const declared = mediaType?.trim().toLowerCase() ?? "";
  if (declared.startsWith("image/")) {
    return declared === "image/jpg" ? "image/jpeg" : declared;
  }
  const extension = href.match(/\.(jpe?g|png|gif|webp)$/i)?.[1];
  return extension
    ? `image/${extension.toLowerCase().replace("jpg", "jpeg")}`
    : "image/jpeg";
}

/**
 * Parses an EPUB archive and extracts its plain text and basic metadata.
 *
 * Steps: locate the OPF package document via `META-INF/container.xml`, read
 * `dc:title` / `dc:creator`, find a cover image, then concatenate the text of
 * the XHTML documents in spine (reading) order.
 *
 * @param file - The EPUB file; handed to `JSZip.loadAsync`.
 * @returns The book text (documents joined by single spaces) and metadata.
 *   `metadata.cover` is a base64 `data:` URL, or null when no cover is found.
 * @throws Error when `container.xml` is missing, names no rootfile, or the
 *   OPF cannot be read. Errors from `JSZip.loadAsync` propagate unchanged.
 */
export async function parseEpub(file: File): Promise<EpubParseResult> {
  const zip = await JSZip.loadAsync(file);
  // Returns the first candidate path that exists in the archive, if any.
  const openEntry = (paths: string[]) =>
    paths.map((path) => zip.file(path)).find((entry) => entry !== null);

  // container.xml points at the OPF package document.
  const containerXml = await zip.file("META-INF/container.xml")?.async("text");
  if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");

  const opfPath = parseStartTags(containerXml, "rootfile")
    .map((attributes) => attributes.get("full-path"))
    .find((path) => path !== undefined && path !== "");
  if (!opfPath) throw new Error("Invalid EPUB: cannot find rootfile");

  // Manifest hrefs are relative to the directory that holds the OPF.
  const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);

  const opfContent = await zip.file(opfPath)?.async("text");
  if (!opfContent) throw new Error("Invalid EPUB: cannot read OPF");

  const metadata: BookMetadata = {
    title: readDcElement(opfContent, "title"),
    author: readDcElement(opfContent, "creator"),
    cover: null,
  };

  const manifestItems = parseStartTags(opfContent, "item").filter((item) =>
    item.has("href"),
  );

  // Cover lookup, in priority order:
  //   1. EPUB 3: manifest item whose `properties` include "cover-image".
  //   2. EPUB 2: <meta name="cover" content="ID"> naming a manifest item id.
  //   3. Heuristic: image item whose id contains "cover".
  // A failed step falls through to the next one.
  const metaCoverId = parseStartTags(opfContent, "meta")
    .find((meta) => meta.get("name")?.toLowerCase() === "cover")
    ?.get("content")
    ?.toLowerCase();
  const coverItem =
    manifestItems.find((item) =>
      (item.get("properties") ?? "")
        .toLowerCase()
        .split(/\s+/)
        .includes("cover-image"),
    ) ??
    (metaCoverId === undefined
      ? undefined
      : manifestItems.find(
          (item) => item.get("id")?.toLowerCase() === metaCoverId,
        )) ??
    manifestItems.find(
      (item) =>
        /cover/i.test(item.get("id") ?? "") &&
        /^image\//i.test(item.get("media-type") ?? ""),
    );

  // Embed the cover as a base64 data URL so it can be persisted with the book.
  if (coverItem) {
    const coverHref = coverItem.get("href") ?? "";
    const coverEntry = openEntry(candidateZipPaths(opfDir, coverHref));
    if (coverEntry) {
      const coverBase64 = await coverEntry.async("base64");
      const mimeType = imageMimeType(coverItem.get("media-type"), coverHref);
      metadata.cover = `data:${mimeType};base64,${coverBase64}`;
    }
  }

  // Map manifest ids to items so the spine's idrefs can be resolved.
  const itemById = new Map<string, Map<string, string>>();
  for (const item of manifestItems) {
    const id = item.get("id");
    if (id !== undefined) itemById.set(id, item);
  }

  // One list of candidate archive paths per XHTML document, in reading order.
  const documents: string[][] = [];
  for (const itemref of parseStartTags(opfContent, "itemref")) {
    const item = itemById.get(itemref.get("idref") ?? "");
    const href = item?.get("href");
    if (item?.get("media-type") === "application/xhtml+xml" && href) {
      documents.push(candidateZipPaths(opfDir, href));
    }
  }

  // No usable spine: fall back to every HTML-like file in the archive. These
  // names are already archive-absolute, so they must not be prefixed with opfDir.
  if (documents.length === 0) {
    for (const path of Object.keys(zip.files)) {
      if (/\.(xhtml|html|htm)$/.test(path)) documents.push([path]);
    }
  }

  const chunks: string[] = [];
  for (const paths of documents) {
    const markup = await openEntry(paths)?.async("text");
    const text = markup ? extractText(markup) : "";
    if (text !== "") chunks.push(text);
  }

  return { text: chunks.join(" "), metadata };
}
|
||||
|
||||
/**
 * Looks up a book on the Open Library search API and returns its first hit.
 *
 * Title and author are trimmed and joined into one free-text query. This is a
 * best-effort lookup: every failure (no usable query, HTTP error, network
 * error, unexpected payload) yields null instead of throwing.
 *
 * @param title - Book title, or null when unknown.
 * @param author - Author name, or null when unknown.
 * @returns Metadata from the first search result, with `cover` set to an
 *   Open Library cover URL when the result has a cover id; null when nothing
 *   could be fetched.
 */
export async function fetchMetadataFromOpenLibrary(
  title: string | null,
  author: string | null,
): Promise<BookMetadata | null> {
  const query = [title, author]
    .map((part) => part?.trim() ?? "")
    .filter((part) => part !== "")
    .join(" ");
  // Nothing to search for: skip the request entirely.
  if (query === "") return null;

  try {
    const url = `https://openlibrary.org/search.json?q=${encodeURIComponent(query)}&limit=1&fields=title,author_name,cover_i`;
    const response = await fetch(url);
    if (!response.ok) return null;

    // The payload is external input: narrow it step by step instead of casting.
    const data: unknown = await response.json();
    if (typeof data !== "object" || data === null) return null;
    const docs = (data as { docs?: unknown }).docs;
    if (!Array.isArray(docs) || docs.length === 0) return null;

    const book: unknown = docs[0];
    if (typeof book !== "object" || book === null) return null;
    const {
      title: foundTitle,
      author_name: authorNames,
      cover_i: coverId,
    } = book as Record<string, unknown>;
    const firstAuthor: unknown = Array.isArray(authorNames)
      ? authorNames[0]
      : undefined;

    return {
      title: typeof foundTitle === "string" && foundTitle ? foundTitle : null,
      author:
        typeof firstAuthor === "string" && firstAuthor ? firstAuthor : null,
      // Only positive numeric ids are turned into a cover URL.
      cover:
        typeof coverId === "number" && coverId > 0
          ? `https://covers.openlibrary.org/b/id/${coverId}-M.jpg`
          : null,
    };
  } catch (e) {
    console.error("Failed to fetch from Open Library:", e);
    return null;
  }
}
|
||||
|
||||
Reference in New Issue
Block a user