123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- const express = require("express");
- const multer = require("multer");
- const pdf = require("pdf-parse");
- const fs = require('fs');
// Application setup: create the Express app and register the JSON and
// URL-encoded body parsers.
const app = express();
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

/**
 * Middleware that attaches permissive CORS headers to every response and
 * then hands control to the next handler.
 */
function allowCrossOrigin(req, res, next) {
  const corsHeaders = [
    ["Access-Control-Allow-Origin", "*"],
    ["Access-Control-Allow-Headers", "Content-Type, x-requested-with"],
  ];
  corsHeaders.forEach(([name, value]) => {
    res.setHeader(name, value);
  });
  next();
}
app.use(allowCrossOrigin);
/**
 * Multer disk-storage engine for uploaded files.
 *
 * - destination: files are written to "uploads/". The directory is created on
 *   demand because multer does not create it when `destination` is supplied
 *   as a function.
 * - filename: the client-supplied name is kept (so the file stays
 *   recognisable) but is stripped of path separators and prefixed with a
 *   unique token so two uploads with the same name cannot overwrite each other.
 */
const storage = multer.diskStorage({
  destination: function (req, file, cb) {
    const uploadDir = "uploads/";
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdir(uploadDir, { recursive: true }, (err) => cb(err, uploadDir));
  },
  filename: function (req, file, cb) {
    // The original name comes from the client and is untrusted: neutralise
    // path separators and NUL bytes before using it on disk.
    const safeName = String(file.originalname).replace(/[\\/\0]/g, "_");
    const uniquePrefix = `${Date.now()}-${Math.round(Math.random() * 1e9)}`;
    cb(null, `${uniquePrefix}-${safeName}`);
  }
});
// Parser options object. In the visible source it is only referenced by the
// commented-out legacy upload handler.
// NOTE(review): `pdfjsDataRangeTransport` is not defined anywhere in the
// visible source, so calling the factory throws a ReferenceError - confirm
// where it is meant to come from before passing these options to a parser.
const options = {
  normalizeWhitespace: true,
  pdfjsDataRangeTransportFactory: function () {
    const transport = new pdfjsDataRangeTransport();
    return transport;
  },
};
/**
 * Extracts the text of a PDF file page by page, removes the header/footer
 * marker text from every page and returns the pages joined by a single space.
 *
 * The pdf-parse result object exposes only the concatenated `text` (there is
 * no per-page array), so the per-page text is collected through the
 * `pagerender` hook, which pdf-parse invokes once per page in page order.
 *
 * @param {string} pdfUrl - Path of the PDF file on disk.
 * @param {{header?: string, footer?: string}} [markers] - Literal header and
 *   footer strings to strip from each page. Defaults keep the original
 *   placeholder values.
 * @returns {Promise<string>} Cleaned page texts joined with ' '.
 * @throws Rejects when the file cannot be read or the PDF cannot be parsed.
 */
async function extractTextFromPDF(pdfUrl, { header = '页眉的文本', footer = '页脚的文本' } = {}) {
  // Read the PDF without blocking the event loop.
  const dataBuffer = await fs.promises.readFile(pdfUrl);

  // Build one literal-matching pattern for header and footer; the markers are
  // escaped so characters such as '.' or '(' are not treated as regex syntax.
  const escapeRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const markers = [header, footer]
    .filter((marker) => typeof marker === 'string' && marker.length > 0)
    .map(escapeRegExp);
  const markerPattern = markers.length > 0 ? new RegExp(`(${markers.join('|')})`, 'g') : null;

  const pageTexts = [];

  // Custom page renderer: same line-joining rule as pdf-parse's default
  // renderer (a change in the item's y coordinate starts a new line), plus
  // header/footer removal and per-page collection.
  const renderPage = async (pageData) => {
    const textContent = await pageData.getTextContent({
      normalizeWhitespace: false,
      disableCombineTextItems: false,
    });

    let lastY;
    let pageText = '';
    for (const item of textContent.items) {
      const y = item.transform[5];
      pageText += lastY === undefined || lastY === y ? item.str : `\n${item.str}`;
      lastY = y;
    }

    // Drop the header and footer text from this page.
    const cleaned = markerPattern ? pageText.replace(markerPattern, '') : pageText;
    pageTexts.push(cleaned);
    return cleaned;
  };

  await pdf(dataBuffer, { pagerender: renderPage });
  return pageTexts.join(' ');
}
// Multer instance that persists incoming files through the `storage` engine.
const upload = multer({ storage: storage });
// Only referenced by the commented-out legacy handler in the visible source;
// kept so that re-enabling that code does not break.
let hasPrinted = false;

/**
 * POST /upload
 *
 * Accepts a single multipart file in the "file" field, extracts its text with
 * extractTextFromPDF, logs it, and returns it to the client.
 *
 * Responses:
 *   200 { text }   - extraction succeeded
 *   400 { error }  - the request carried no "file" part
 *   500 { error }  - reading or parsing the PDF failed
 */
app.post("/upload", upload.single("file"), async (req, res) => {
  // multer leaves req.file undefined when the "file" field is absent.
  if (!req.file) {
    res.status(400).json({ error: 'No file uploaded; expected a multipart field named "file".' });
    return;
  }

  try {
    const text = await extractTextFromPDF(req.file.path);
    console.log(text);
    // Always answer the request; otherwise the client hangs until it times out.
    res.json({ text });
  } catch (error) {
    console.error(error);
    res.status(500).json({ error: error.message });
  }
});
- // let dataBuffer = fs.readFileSync(filePath);
- // pdf(dataBuffer, options).then(function (data) {
- // if (!hasPrinted) {
- // console.log(data.text);
- // hasPrinted = true
- // }
- // });
- // } catch (error) {
- // res.status(500).json({error: error.message});
- // }
- // hasPrinted = false
- // });
// Start the HTTP server. The port can be overridden with the PORT environment
// variable; it falls back to 3000 when PORT is unset or not a valid number.
const port = Number(process.env.PORT) || 3000;
app.listen(port, () => {
  console.log(`Server started on port ${port}`);
});
|