Skip to content

Commit 4b38b39

Browse files
authored
chore Use pdf library to check file without extension (#115)
### Summary Instead of manually checking whether the filename ends with the `.pdf` extension and returning is_pdf = false when it does not, use the existing PDF library to read the file content and decide whether that content is a valid PDF.
1 parent 485ef30 commit 4b38b39

File tree

2 files changed

+36
-9
lines changed

2 files changed

+36
-9
lines changed

src/hooks/custom/utils/pdf.ts

+3-8
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,7 @@ export async function splitPdf(
9898
}
9999

100100
/**
101-
* Checks if the given file is a PDF. First it checks the `.pdf` file extension, then
102-
* it tries to load the file as a PDF using the `PDFDocument.load` method.
101+
* Checks if the given file is a PDF by loading the file as a PDF using the `PDFDocument.load` method.
103102
* @param file - The file to check.
104103
* @returns A promise that resolves to three values, first is a boolean representing
105104
* whether there was an error during PDF load, second is a PDFDocument object or null
@@ -109,8 +108,8 @@ export async function splitPdf(
109108
export async function loadPdf(
110109
file: File | null
111110
): Promise<[boolean, PDFDocument | null, number]> {
112-
if (!file?.name.endsWith(".pdf")) {
113-
console.info("Given file is not a PDF, so splitting is not enabled.");
111+
if (!file) {
112+
console.info("Given file is null, so splitting is not enabled.");
114113
return [true, null, 0];
115114
}
116115

@@ -120,10 +119,6 @@ export async function loadPdf(
120119
const pagesCount = pdf.getPages().length;
121120
return [false, pdf, pagesCount];
122121
} catch (e) {
123-
console.error(e);
124-
console.warn(
125-
"Attempted to interpret file as pdf, but error arose when splitting by pages. Reverting to non-split pdf handling path."
126-
);
127122
return [true, null, 0];
128123
}
129124
}

test/unit/utils/pdf.test.ts

+33-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ describe("Pdf utility functions", () => {
9797
});
9898

9999
describe("loadPdf", () => {
100-
it("should return true, null, and 0 if the file is not a PDF", async () => {
100+
it("should return true, null, and 0 if the file is null", async () => {
101101
const result = await loadPdf(null);
102102

103103
expect(result).toEqual([true, null, 0]);
@@ -115,6 +115,19 @@ describe("Pdf utility functions", () => {
115115
expect(file.content).not.toHaveBeenCalled();
116116
});
117117

118+
it("should return true, null, and 0 if the file is not a PDF without basing on file extension", async () => {
119+
const file = {
120+
name: "uuid1234",
121+
content: jest.fn().mockResolvedValue(new ArrayBuffer(0)),
122+
};
123+
124+
const result = await loadPdf(file as any);
125+
126+
expect(result).toEqual([true, null, 0]);
127+
expect(file.content).not.toHaveBeenCalled();
128+
});
129+
130+
118131
it("should return true, null, and 0 if there is an error while loading the PDF", async () => {
119132
const file = {
120133
name: "document.pdf",
@@ -143,5 +156,24 @@ describe("Pdf utility functions", () => {
143156
expect(loadMock).toHaveBeenCalledTimes(1);
144157
expect(loadMock).toHaveBeenCalledWith(f.arrayBuffer());
145158
});
159+
160+
it("should return false, PDFDocument object, and the number of pages if the PDF is loaded successfully without basing on file extension", async () => {
161+
const file = readFileSync("test/data/layout-parser-paper-fast.pdf");
162+
const f = {
163+
name: "uuid1234",
164+
arrayBuffer: () => file.buffer,
165+
};
166+
167+
jest.clearAllMocks(); // Reset Mocks Between Tests
168+
const loadMock = jest.spyOn(PDFDocument, "load");
169+
170+
const [error, _, pages] = await loadPdf(f as any);
171+
172+
expect(error).toBeFalsy();
173+
expect(pages).toEqual(2);
174+
expect(loadMock).toHaveBeenCalledTimes(1);
175+
expect(loadMock).toHaveBeenCalledWith(f.arrayBuffer());
176+
});
177+
146178
});
147179
});

0 commit comments

Comments
 (0)