// mammoth.tests.js
  1. var assert = require("assert");
  2. var path = require("path");
  3. var fs = require("fs");
  4. var _ = require("underscore");
  5. var mammoth = require("../");
  6. var promises = require("../lib/promises");
  7. var results = require("../lib/results");
  8. var testing = require("./testing");
  9. var test = require("./test")(module);
  10. var testData = testing.testData;
  11. var createFakeDocxFile = testing.createFakeDocxFile;
  // Converts the single-paragraph fixture from disk and expects one <p> and no messages.
  test('should convert docx containing one paragraph to single p element', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<p>Walking on imported air</p>");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Reads the fixture into a Buffer first, then converts from {buffer: ...}.
  test('should convert docx represented by a Buffer', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function convertContents(contents) {
          return mammoth.convertToHtml({buffer: contents});
      }

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<p>Walking on imported air</p>");
          assert.deepEqual(conversion.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(convertContents)
          .then(checkConversion);
  });
  // Converts the utf8-bom fixture and expects its paragraph text with no messages.
  test('should read docx xml files with unicode byte order mark', function() {
      var fixturePath = path.join(__dirname, "test-data/utf8-bom.docx");
      var pending = mammoth.convertToHtml({path: fixturePath});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, "<p>This XML has a byte order mark.</p>");
          assert.deepEqual(conversion.messages, []);
      });
  });
  // With default options the empty fixture converts to an empty string.
  test('empty paragraphs are ignored by default', function() {
      var fixturePath = path.join(__dirname, "test-data/empty.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, "");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Passing ignoreEmptyParagraphs: false keeps the empty paragraph as <p></p>.
  test('empty paragraphs are preserved if ignoreEmptyParagraphs is false', function() {
      var fixturePath = path.join(__dirname, "test-data/empty.docx");
      var input = {path: fixturePath};
      var options = {ignoreEmptyParagraphs: false};

      return mammoth.convertToHtml(input, options).then(function(conversion) {
          assert.equal(conversion.value, "<p></p>");
          assert.deepEqual(conversion.messages, []);
      });
  });
  // Supplies the style map as a single string and expects the paragraph to be
  // rendered as <h1>.
  test('style map can be expressed as string', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: "p => h1"
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Supplies the style map as an array of mapping strings and expects the
  // paragraph to be rendered as <h1>.
  test('style map can be expressed as array of style mappings', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: ["p => h1"]
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // The embedded-style-map fixture converts to <h1> without any explicit options.
  test('embedded style map is used if present', function() {
      var fixturePath = path.join(__dirname, "test-data/embedded-style-map.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<h1>Walking on imported air</h1>");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // An explicit "p => p" mapping wins over the fixture's embedded style map.
  test('explicit style map takes precedence over embedded style map', function() {
      var fixturePath = path.join(__dirname, "test-data/embedded-style-map.docx");
      var explicitMappings = ["p => p"];
      var pending = mammoth.convertToHtml({path: fixturePath}, {styleMap: explicitMappings});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, "<p>Walking on imported air</p>");
          assert.deepEqual(conversion.messages, []);
      });
  });
  // An explicit run mapping is applied together with the embedded paragraph mapping.
  test('explicit style map is combined with embedded style map', function() {
      var fixturePath = path.join(__dirname, "test-data/embedded-style-map.docx");
      var conversionOptions = {styleMap: ["r => strong"]};

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<h1><strong>Walking on imported air</strong></h1>");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // includeEmbeddedStyleMap: false makes the fixture convert to a plain <p>.
  test('embedded style maps can be disabled', function() {
      var fixturePath = path.join(__dirname, "test-data/embedded-style-map.docx");
      var input = {path: fixturePath};
      var conversionOptions = {includeEmbeddedStyleMap: false};

      return mammoth.convertToHtml(input, conversionOptions).then(function(conversion) {
          assert.equal(conversion.value, "<p>Walking on imported air</p>");
          assert.deepEqual(conversion.messages, []);
      });
  });
  // Embeds "p => h1", writes the document with toBuffer(), then converts that Buffer.
  test('embedded style map can be written using toBuffer() and then read', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function embedMapping(contents) {
          return mammoth.embedStyleMap({buffer: contents}, "p => h1");
      }

      function convertWritten(docx) {
          var written = docx.toBuffer();
          assert.ok(Buffer.isBuffer(written));
          return mammoth.convertToHtml({buffer: written});
      }

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<h1>Walking on imported air</h1>");
          assert.deepEqual(conversion.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(embedMapping)
          .then(convertWritten)
          .then(checkConversion);
  });
  // Embeds "p => h1", writes the document with toArrayBuffer(), wraps the result
  // in a Buffer and converts it.
  test('embedded style map can be written using toArrayBuffer() and then read', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function embedMapping(contents) {
          return mammoth.embedStyleMap({buffer: contents}, "p => h1");
      }

      function convertWritten(docx) {
          var written = docx.toArrayBuffer();
          // The value returned by toArrayBuffer() must not be a Node Buffer.
          assert.ok(!Buffer.isBuffer(written));
          return mammoth.convertToHtml({buffer: Buffer.from(written)});
      }

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<h1>Walking on imported air</h1>");
          assert.deepEqual(conversion.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(embedMapping)
          .then(convertWritten)
          .then(checkConversion);
  });
  // Embeds "p => h1" and reads it back with readEmbeddedStyleMap.
  test('embedded style map can be retrieved', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function embedMapping(contents) {
          return mammoth.embedStyleMap({buffer: contents}, "p => h1");
      }

      function readBack(docx) {
          return mammoth.readEmbeddedStyleMap({buffer: docx.toBuffer()});
      }

      function checkStyleMap(embedded) {
          assert.equal(embedded, "p => h1");
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(embedMapping)
          .then(readBack)
          .then(checkStyleMap);
  });
  // Supplies a style map whose first line cannot be parsed and expects the
  // remaining mapping to be applied plus a single warning describing the bad line.
  test('warning if style mapping is not understood', function() {
      var docxPath = path.join(__dirname, "test-data/single-paragraph.docx");
      var options = {
          styleMap: "????\np => h1"
      };
      return mammoth.convertToHtml({path: docxPath}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Walking on imported air</h1>");
          var warning = "Did not understand this style mapping, so ignored it: ????\n" +
              'Error was at character number 1: Expected element type but got unrecognisedCharacter "?"';
          assert.deepEqual(result.messages, [results.warning(warning)]);
      });
  });
  // Checks that options given to mammoth.convertToHtml affect the output: with
  // styleMap "p => h1" the paragraph is expected as <h1>.
  test('options are passed to document converter when calling mammoth.convertToHtml', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: "p => h1"
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Passes a transformDocument callback that sets the first child's styleId to
  // "Heading1" and expects the output to be an <h1>.
  test('options.transformDocument is used to transform document if set', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          transformDocument: function(document) {
              document.children[0].styleId = "Heading1";
              return document;
          }
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Builds the transformDocument option with mammoth.transforms.paragraph, giving
  // it a callback that sets styleId to "Heading1", and expects an <h1> in the output.
  test('mammoth.transforms.paragraph only transforms paragraphs', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          transformDocument: mammoth.transforms.paragraph(function(paragraph) {
              return _.extend(paragraph, {styleId: "Heading1"});
          })
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); keeping that order makes
          // failure reports label the two values correctly.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Converts the tiny-picture fixture and compares the resulting <img> markup.
  // NOTE(review): the expected src attribute is empty in this copy of the file;
  // confirm against the fixture whether a data URI was intended here.
  test('inline images referenced by path relative to part are included in output', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, '<p><img src="" /></p>');
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Converts the tiny-picture-target-base-relative fixture and compares the <img> markup.
  // NOTE(review): the expected src attribute is empty in this copy of the file;
  // confirm against the fixture whether a data URI was intended here.
  test('inline images referenced by path relative to base are included in output', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture-target-base-relative.docx");
      var pending = mammoth.convertToHtml({path: fixturePath});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, '<p><img src="" /></p>');
      });
  });
  // Uses element.read("base64") inside a custom image converter; src is built from
  // the first two base64 characters and the element's content type.
  test('src of inline images can be changed using read("base64")', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.read("base64").then(function(base64) {
              var prefix = base64.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var conversionOptions = {convertImage: mammoth.images.imgElement(describeImage)};

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(function(conversion) {
          assert.deepEqual(conversion.messages, []);
          assert.equal(conversion.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Same check as the read("base64") test, but through element.readAsBase64String().
  test('src of inline images can be changed using readAsBase64String()', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.readAsBase64String().then(function(base64) {
              var prefix = base64.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var conversionOptions = {convertImage: mammoth.images.imgElement(describeImage)};

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(function(conversion) {
          assert.deepEqual(conversion.messages, []);
          assert.equal(conversion.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Reads the image with element.readAsArrayBuffer(), checks the value is not a
  // Node Buffer, then base64-encodes it to build src.
  test('src of inline images can be changed using readAsArrayBuffer()', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.readAsArrayBuffer().then(function(rawBytes) {
              assert.ok(!Buffer.isBuffer(rawBytes));
              var base64 = Buffer.from(rawBytes).toString("base64");
              var prefix = base64.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var conversionOptions = {convertImage: mammoth.images.imgElement(describeImage)};

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(function(conversion) {
          assert.deepEqual(conversion.messages, []);
          assert.equal(conversion.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Reads the image with element.read() (no encoding), checks the value is a
  // Buffer, then base64-encodes it to build src.
  test('src of inline images can be changed using read()', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.read().then(function(contents) {
              assert.ok(Buffer.isBuffer(contents));
              var base64 = contents.toString("base64");
              var prefix = base64.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var conversionOptions = {convertImage: mammoth.images.imgElement(describeImage)};

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(function(conversion) {
          assert.deepEqual(conversion.messages, []);
          assert.equal(conversion.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Reads the image with element.readAsBuffer(), checks the value is a Buffer,
  // then base64-encodes it to build src.
  test('src of inline images can be changed using readAsBuffer()', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.readAsBuffer().then(function(contents) {
              assert.ok(Buffer.isBuffer(contents));
              var base64 = contents.toString("base64");
              var prefix = base64.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var conversionOptions = {convertImage: mammoth.images.imgElement(describeImage)};

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(function(conversion) {
          assert.deepEqual(conversion.messages, []);
          assert.equal(conversion.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Converts the external-picture fixture by path and expects <img> markup with no messages.
  // NOTE(review): the expected src attribute is empty in this copy of the file;
  // confirm against the fixture whether a data URI was intended here.
  test('images stored outside of document are included in output', function() {
      var fixturePath = path.join(__dirname, "test-data/external-picture.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, '<p><img src="" /></p>');
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Converts the external-picture fixture from a Buffer (so no input path is
  // supplied) and expects empty output plus an error message as the first message.
  test('error if images stored outside of document are specified when passing file without path', function() {
      var fixturePath = path.join(__dirname, "test-data/external-picture.docx");
      var contents = fs.readFileSync(fixturePath);

      function checkConversion(conversion) {
          assert.equal(conversion.value, '');
          var firstMessage = conversion.messages[0];
          assert.equal(firstMessage.message, "could not find external image 'tiny-picture.png', path of input document is unknown");
          assert.equal(firstMessage.type, "error");
      }

      return mammoth.convertToHtml({buffer: contents}).then(checkConversion);
  });
  // The simple-list fixture converts to a <ul> with two <li> items.
  test('simple list is converted to list elements', function() {
      var fixturePath = path.join(__dirname, "test-data/simple-list.docx");
      var pending = mammoth.convertToHtml({path: fixturePath});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, '<ul><li>Apple</li><li>Banana</li></ul>');
      });
  });
  // The tables fixture converts to a 2x2 <table> between two paragraphs.
  test('word tables are converted to html tables', function() {
      var fixturePath = path.join(__dirname, "test-data/tables.docx");

      function checkConversion(conversion) {
          var expectedHtml = [
              "<p>Above</p>",
              "<table>",
              "<tr><td><p>Top left</p></td><td><p>Top right</p></td></tr>",
              "<tr><td><p>Bottom left</p></td><td><p>Bottom right</p></td></tr>",
              "</table>",
              "<p>Below</p>"
          ].join("");
          assert.equal(conversion.value, expectedHtml);
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Footnote references become superscript links and the notes are listed in an
  // <ol> after the body; ids carry the configured idPrefix.
  test('footnotes are appended to text', function() {
      // TODO: don't duplicate footnotes with multiple references
      var fixturePath = path.join(__dirname, "test-data/footnotes.docx");
      var conversionOptions = {idPrefix: "doc-42-"};

      function checkConversion(conversion) {
          var expectedOutput = [
              '<p>Ouch',
              '<sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup>.',
              '<sup><a href="#doc-42-footnote-2" id="doc-42-footnote-ref-2">[2]</a></sup></p>',
              '<ol><li id="doc-42-footnote-1"><p> A tachyon walks into a bar. <a href="#doc-42-footnote-ref-1">↑</a></p></li>',
              '<li id="doc-42-footnote-2"><p> Fin. <a href="#doc-42-footnote-ref-2">↑</a></p></li></ol>'
          ].join('');
          assert.equal(conversion.value, expectedOutput);
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // Endnote references become superscript links and the notes are listed in an
  // <ol> after the body; ids carry the configured idPrefix.
  test('endnotes are appended to text', function() {
      var fixturePath = path.join(__dirname, "test-data/endnotes.docx");
      var conversionOptions = {idPrefix: "doc-42-"};

      function checkConversion(conversion) {
          var expectedOutput = [
              '<p>Ouch',
              '<sup><a href="#doc-42-endnote-2" id="doc-42-endnote-ref-2">[1]</a></sup>.',
              '<sup><a href="#doc-42-endnote-3" id="doc-42-endnote-ref-3">[2]</a></sup></p>',
              '<ol><li id="doc-42-endnote-2"><p> A tachyon walks into a bar. <a href="#doc-42-endnote-ref-2">↑</a></p></li>',
              '<li id="doc-42-endnote-3"><p> Fin. <a href="#doc-42-endnote-ref-3">↑</a></p></li></ol>'
          ].join('');
          assert.equal(conversion.value, expectedOutput);
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // A hyperlink inside a footnote is expected to keep its external href.
  test('relationships are handled properly in footnotes', function() {
      var fixturePath = path.join(__dirname, "test-data/footnote-hyperlink.docx");
      var conversionOptions = {idPrefix: "doc-42-"};

      function checkConversion(conversion) {
          var expectedOutput = [
              '<p><sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup></p>',
              '<ol><li id="doc-42-footnote-1"><p> <a href="http://www.example.com">Example</a> <a href="#doc-42-footnote-ref-1">↑</a></p></li></ol>'
          ].join('');
          assert.equal(conversion.value, expectedOutput);
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // With a "comment-reference => sup" mapping, comment references are rendered as
  // superscript links and the comments are listed in a <dl> after the body.
  test('when style mapping is defined for comment references then comments are included', function() {
      var fixturePath = path.join(__dirname, "test-data/comments.docx");
      var conversionOptions = {
          idPrefix: "doc-42-",
          styleMap: "comment-reference => sup"
      };

      function checkConversion(conversion) {
          var expectedOutput = [
              '<p>Ouch',
              '<sup><a href="#doc-42-comment-0" id="doc-42-comment-ref-0">[MW1]</a></sup>.',
              '<sup><a href="#doc-42-comment-2" id="doc-42-comment-ref-2">[MW2]</a></sup></p>',
              '<dl><dt id="doc-42-comment-0">Comment [MW1]</dt><dd><p>A tachyon walks into a bar. <a href="#doc-42-comment-ref-0">↑</a></p></dd>',
              '<dt id="doc-42-comment-2">Comment [MW2]</dt><dd><p>Fin. <a href="#doc-42-comment-ref-2">↑</a></p></dd></dl>'
          ].join('');
          assert.equal(conversion.value, expectedOutput);
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // The text-box fixture converts to a single paragraph containing the box text.
  test('textboxes are read', function() {
      var fixturePath = path.join(__dirname, "test-data/text-box.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, '<p>Datum plane</p>');
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // With default options the underline fixture's output contains only <strong>.
  test('underline is ignored by default', function() {
      var fixturePath = path.join(__dirname, "test-data/underline.docx");
      var pending = mammoth.convertToHtml({path: fixturePath});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, '<p><strong>The Sunset Tree</strong></p>');
      });
  });
  // A "u => em" mapping renders the underlined word inside <em>.
  test('underline can be configured with style mapping', function() {
      var fixturePath = path.join(__dirname, "test-data/underline.docx");
      var conversionOptions = {styleMap: "u => em"};

      function checkConversion(conversion) {
          assert.equal(conversion.value, '<p><strong>The <em>Sunset</em> Tree</strong></p>');
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // With default options struck-through text is wrapped in <s>.
  test('strikethrough is converted to <s> by default', function() {
      var fixturePath = path.join(__dirname, "test-data/strikethrough.docx");
      var pending = mammoth.convertToHtml({path: fixturePath});

      return pending.then(function(conversion) {
          assert.equal(conversion.value, "<p><s>Today's Special: Salmon</s> Sold out</p>");
      });
  });
  // A "strike => del" mapping wraps struck-through text in <del> instead.
  test('strikethrough conversion can be configured with style mappings', function() {
      var fixturePath = path.join(__dirname, "test-data/strikethrough.docx");
      var conversionOptions = {styleMap: "strike => del"};

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<p><del>Today's Special: Salmon</del> Sold out</p>");
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // With prettyPrint: true the paragraph content is placed on its own indented line.
  test('indentation is used if prettyPrint is true', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");
      var conversionOptions = {prettyPrint: true};

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<p>\n Walking on imported air\n</p>");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}, conversionOptions).then(checkConversion);
  });
  // Calls mammoth.styleMapping() and requires it to throw an error with the
  // expected message.
  test('using styleMapping throws error', function() {
      var expectedMessage = 'Use a raw string instead of mammoth.styleMapping e.g. "p[style-name=\'Title\'] => h1" instead of mammoth.styleMapping("p[style-name=\'Title\'] => h1")';

      // assert.throws fails the test when nothing is thrown; a bare try/catch
      // would pass silently in that case.
      assert.throws(
          function() {
              mammoth.styleMapping();
          },
          function(error) {
              assert.equal(error.message, expectedMessage);
              return true;
          }
      );
  });
  // convertToMarkdown on the single-paragraph fixture yields the text followed by a blank line.
  test('can convert single paragraph to markdown', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, "Walking on imported air\n\n");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToMarkdown({path: fixturePath}).then(checkConversion);
  });
  // extractRawText on the list fixture yields each item's text followed by a blank line.
  test('extractRawText only retains raw text', function() {
      var fixturePath = path.join(__dirname, "test-data/simple-list.docx");
      var pending = mammoth.extractRawText({path: fixturePath});

      return pending.then(function(extraction) {
          assert.equal(extraction.value, 'Apple\n\nBanana\n\n');
      });
  });
  // Reads the fixture into a Buffer and runs extractRawText on {buffer: ...}.
  test('extractRawText can use .docx files represented by a Buffer', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function extractFromContents(contents) {
          return mammoth.extractRawText({buffer: contents});
      }

      function checkExtraction(extraction) {
          assert.equal(extraction.value, "Walking on imported air\n\n");
          assert.deepEqual(extraction.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(extractFromContents)
          .then(checkExtraction);
  });
  // The strict-format fixture converts to a single paragraph with no messages.
  test('can read strict format', function() {
      var fixturePath = path.join(__dirname, "test-data/strict-format.docx");

      function checkConversion(conversion) {
          assert.equal(conversion.value, "<p>Test</p>");
          assert.deepEqual(conversion.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(checkConversion);
  });
  // Converting empty.zip must reject with the "main document part" error; a
  // successful conversion fails the test explicitly.
  test('should throw error if file is not a valid docx document', function() {
      var archivePath = path.join(__dirname, "test-data/empty.zip");

      function failOnSuccess() {
          assert.ok(false, "Expected error");
      }

      function checkError(error) {
          assert.equal(error.message, "Could not find main document part. Are you sure this is a valid .docx file?");
      }

      return mammoth.convertToHtml({path: archivePath}).then(failOnSuccess, checkError);
  });