const express = require("express");
const multer = require("multer");
const pdf = require("pdf-parse");
const fs = require("fs");
const path = require("path");

// Directory where multer stores uploaded files.
const UPLOAD_DIR = "uploads/";

// Options forwarded to pdf.js `getTextContent` for every page.
const TEXT_CONTENT_OPTIONS = {
  normalizeWhitespace: true,
  disableCombineTextItems: false,
};

// Placeholder header/footer strings that are stripped from every page.
const HEADER_TEXT = '页眉的文本';
const FOOTER_TEXT = '页脚的文本';

/**
 * Escapes characters that have a special meaning inside a regular expression.
 *
 * @param {string} literal - Text that should be matched verbatim.
 * @returns {string} The text, safe to embed in a RegExp source.
 */
function escapeRegExp(literal) {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

// Built once instead of once per page; the inputs are escaped so that
// future header/footer strings containing regex metacharacters still match literally.
const HEADER_FOOTER_PATTERN = new RegExp(
  `(${escapeRegExp(HEADER_TEXT)}|${escapeRegExp(FOOTER_TEXT)})`,
  "g"
);

/**
 * Joins the text items of one page into a string, starting a new line
 * whenever the vertical position (transform[5]) of an item changes.
 * This follows the custom page renderer example in the pdf-parse README.
 *
 * @param {Array<{str: string, transform: number[]}>} items - Items from `getTextContent`.
 * @returns {string} The page text.
 */
function joinTextItems(items) {
  let previousY;
  let pageText = "";
  for (const item of items) {
    const currentY = item.transform[5];
    if (previousY !== undefined && previousY !== currentY) {
      pageText += "\n";
    }
    pageText += item.str;
    previousY = currentY;
  }
  return pageText;
}

/**
 * Extracts the text of a PDF file page by page, removes the configured
 * header/footer strings from each page and joins the pages with a space.
 *
 * The pdf-parse result object has no `pages` array, so the per-page text is
 * collected through the `pagerender` option instead.
 *
 * @param {string} pdfUrl - Path of the PDF file on disk.
 * @returns {Promise<string>} The extracted text of all rendered pages.
 * @throws Rejects when the file cannot be read or pdf-parse fails to parse it.
 */
async function extractTextFromPDF(pdfUrl) {
  // Read the PDF file without blocking the event loop.
  const dataBuffer = await fs.promises.readFile(pdfUrl);

  const pageTexts = [];
  await pdf(dataBuffer, {
    pagerender: async (pageData) => {
      const textContent = await pageData.getTextContent(TEXT_CONTENT_OPTIONS);
      // Strip header and footer text from the current page.
      const pageText = joinTextItems(textContent.items).replace(
        HEADER_FOOTER_PATTERN,
        ""
      );
      pageTexts.push(pageText);
      return pageText;
    },
  });

  return pageTexts.join(" ");
}

const app = express();
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

// Allow cross-origin requests from any origin.
app.use((req, res, next) => {
  res.setHeader("Access-Control-Allow-Origin", "*");
  res.setHeader("Access-Control-Allow-Headers", "Content-Type, x-requested-with");
  next();
});

// multer does not create the directory when `destination` is a function,
// so make sure it exists before the first upload arrives.
fs.mkdirSync(UPLOAD_DIR, { recursive: true });

const storage = multer.diskStorage({
  destination(req, file, cb) {
    cb(null, UPLOAD_DIR);
  },
  filename(req, file, cb) {
    // Keep the client-supplied name but drop any directory components so the
    // file can never be written outside of UPLOAD_DIR.
    cb(null, path.basename(file.originalname));
  },
});

const upload = multer({ storage });

/**
 * POST /upload — expects a multipart form with a PDF in the "file" field.
 * Responds with `{ text }` on success, 400 when no file was sent and
 * 500 with `{ error }` when extraction fails.
 */
app.post("/upload", upload.single("file"), async (req, res) => {
  if (!req.file) {
    res.status(400).json({ error: 'No file uploaded; expected multipart field "file"' });
    return;
  }

  try {
    const text = await extractTextFromPDF(req.file.path);
    console.log(text);
    res.json({ text });
  } catch (error) {
    console.error(error);
    res.status(500).json({ error: error.message });
  }
});

app.listen(3000, () => {
  console.log("Server started on port 3000");
});