// PDFToText.js
  const fs = require('fs');
  const path = require("path");
  const express = require("express");
  const multer = require("multer");
  const pdf = require("pdf-parse");
  // Express application instance shared by every route in this file.
  const app = express();

  // Body parsers for JSON and URL-encoded form payloads.
  app.use(express.json());
  app.use(express.urlencoded({extended: true}));

  // CORS middleware: sets the allow-origin and allow-headers response headers
  // on every request, then passes control to the next handler.
  app.use(function allowCrossOrigin(req, res, next) {
    const corsHeaders = [
      ["Access-Control-Allow-Origin", "*"],
      ["Access-Control-Allow-Headers", "Content-Type, x-requested-with"],
    ];
    for (const [headerName, headerValue] of corsHeaders) {
      res.setHeader(headerName, headerValue);
    }
    next();
  });
  /**
   * Multer disk-storage engine: uploaded files are written into the "uploads/"
   * directory under the base name of the file name supplied by the client.
   */
  const storage = multer.diskStorage({
    destination: function (req, file, cb) {
      // Multer does not create the directory when `destination` is given as a
      // function, so make sure it exists before handing the path over.
      // `recursive: true` makes this a no-op when the directory is already there.
      fs.mkdir("uploads/", {recursive: true}, function (err) {
        cb(err, "uploads/");
      });
    },
    filename: function (req, file, cb) {
      // `originalname` is client-controlled; keep only its last path segment
      // so the stored file cannot be placed outside the upload directory.
      cb(null, path.basename(file.originalname));
    }
  });
  /**
   * Parser options object.
   *
   * NOTE(review): in the visible file this object is only referenced from
   * commented-out code, and `pdfjsDataRangeTransport` is not defined or
   * imported here, so calling the factory would throw unless that name is
   * provided elsewhere. Confirm before relying on it.
   */
  const options = {
    normalizeWhitespace: true,
    pdfjsDataRangeTransportFactory: function () {
      const transport = new pdfjsDataRangeTransport();
      return transport;
    }
  };
  /**
   * Removes every literal occurrence of each marker string from `text`.
   *
   * @param {string} text - Text to clean.
   * @param {string[]} markers - Literal strings to remove; empty strings are skipped.
   * @returns {string} `text` with all marker occurrences removed.
   */
  function stripMarkers(text, markers) {
    return markers.reduce((remaining, marker) => {
      // split/join removes literal matches, so markers containing regex
      // metacharacters need no escaping.
      return marker ? remaining.split(marker).join('') : remaining;
    }, text);
  }

  /**
   * Reads a PDF file from disk and resolves with its text, with the header and
   * footer marker strings removed.
   *
   * The result object of pdf-parse exposes the text of the whole document as
   * `data.text`; it has no per-page `pages` array, so the text is cleaned as a
   * single string.
   *
   * @param {string} pdfUrl - File-system path of the PDF to read.
   * @returns {Promise<string>} Extracted text without header/footer markers.
   *   Rejects if the file cannot be read or the PDF cannot be parsed.
   */
  function extractTextFromPDF(pdfUrl) {
    // Placeholder header/footer strings to strip from the extracted text.
    const header = '页眉的文本';
    const footer = '页脚的文本';

    // Asynchronous read keeps the server's event loop free while loading the file.
    return fs.promises.readFile(pdfUrl)
      .then(dataBuffer => pdf(dataBuffer))
      .then(data => {
        const documentText = typeof data.text === 'string' ? data.text : '';
        return stripMarkers(documentText, [header, footer]);
      });
  }
  // Multer middleware factory bound to the disk storage configured above.
  const upload = multer({storage: storage});
  // NOTE(review): only referenced from commented-out code in the visible file.
  let hasPrinted = false;

  /**
   * POST /upload — accepts one multipart file in the "file" field, extracts
   * its text and returns it as JSON.
   *
   * Responses:
   *   200 {text}   extraction succeeded (the text is also logged)
   *   400 {error}  the request carried no file in the "file" field
   *   500 {error}  the file could not be read or parsed
   */
  app.post("/upload", upload.single("file"), (req, res) => {
    // upload.single leaves req.file undefined when no file was sent.
    if (!req.file) {
      res.status(400).json({error: 'No file uploaded; expected multipart field "file".'});
      return;
    }

    extractTextFromPDF(req.file.path).then(text => {
      console.log(text);
      // Always answer the request; otherwise the client waits indefinitely.
      res.json({text: text});
    }).catch(error => {
      console.error(error);
      // Skip the error response if a reply has already been started.
      if (!res.headersSent) {
        res.status(500).json({error: error.message});
      }
    });
  });
  // Disabled code, apparently an earlier draft of the upload handler body
  // (kept for reference):
  // let dataBuffer = fs.readFileSync(filePath);
  // pdf(dataBuffer, options).then(function (data) {
  // if (!hasPrinted) {
  // console.log(data.text);
  // hasPrinted = true
  // }
  // });
  // } catch (error) {
  // res.status(500).json({error: error.message});
  // }
  // hasPrinted = false
  // });
  // Start the HTTP server on port 3000 and log a message once it is listening.
  app.listen(3000, function onListening() {
    console.log("Server started on port 3000");
  });