123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687 |
// Public API of this module: the body-reader factory, plus
// readNumberingProperties, which is exposed under an underscore-prefixed name.
exports.createBodyReader = createBodyReader;
exports._readNumberingProperties = readNumberingProperties;
- var dingbatToUnicode = require("dingbat-to-unicode");
- var _ = require("underscore");
- var documents = require("../documents");
- var Result = require("../results").Result;
- var warning = require("../results").warning;
- var uris = require("./uris");
/**
 * Builds the public body-reader facade for the given options.
 *
 * Every call on the returned object constructs a new BodyReader from
 * `options`, so reader state is not shared between separate calls.
 *
 * @param {Object} options passed unchanged to BodyReader.
 * @returns {{readXmlElement: Function, readXmlElements: Function}}
 */
function createBodyReader(options) {
    function freshReader() {
        return new BodyReader(options);
    }

    var facade = {};
    facade.readXmlElement = function(element) {
        return freshReader().readXmlElement(element);
    };
    facade.readXmlElements = function(elements) {
        return freshReader().readXmlElements(elements);
    };
    return facade;
}
- function BodyReader(options) {
// Collaborators taken from the options object.
var relationships = options.relationships;
var contentTypes = options.contentTypes;
var docxFile = options.docxFile;
var files = options.files;
var numbering = options.numbering;
var styles = options.styles;

// Mutable reader state.
// Complex fields that are currently open: readFldChar pushes on "begin" and
// pops on "end".
var complexFieldStack = [];
// Instruction text fragments gathered by readInstrText since the last "begin".
var currentInstrText = [];
// When a paragraph is marked as deleted, its contents should be combined
// with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
// ECMA-376 4th edition Part 1.
var deletedParagraphContents = [];
/**
 * Reads each element with readXmlElement and merges the per-element results
 * into a single ReadResult.
 */
function readXmlElements(elements) {
    return combineResults(elements.map(readXmlElement));
}
/**
 * Reads a single XML node.
 *
 * Nodes whose type is not "element" produce an empty result. Elements are
 * dispatched through xmlElementReaders; elements listed in ignoreElements are
 * dropped silently; anything else is dropped with a warning.
 *
 * @param {Object} element XML node with `type` and `name`.
 * @returns {ReadResult}
 */
function readXmlElement(element) {
    if (element.type !== "element") {
        return emptyResult();
    }

    var hasOwn = Object.prototype.hasOwnProperty;

    // Own-property check so that element names such as "constructor" or
    // "toString" cannot resolve to members inherited from Object.prototype
    // and be invoked as if they were readers.
    if (hasOwn.call(xmlElementReaders, element.name)) {
        var handler = xmlElementReaders[element.name];
        return handler(element);
    }

    if (hasOwn.call(ignoreElements, element.name)) {
        return emptyResult();
    }

    var message = warning("An unrecognised element was ignored: " + element.name);
    return emptyResultWithMessages([message]);
}
/**
 * Reads paragraph properties (style, alignment, numbering, indent) from a
 * paragraph-properties element.
 *
 * @returns {ReadResult} result whose value is a "paragraphProperties" object;
 *     carries whatever messages readParagraphStyle produced.
 */
function readParagraphProperties(element) {
    return readParagraphStyle(element).map(function(style) {
        var properties = {type: "paragraphProperties"};
        properties.styleId = style.styleId;
        properties.styleName = style.name;
        properties.alignment = element.firstOrEmpty("w:jc").attributes["w:val"];
        properties.numbering = readNumberingProperties(
            style.styleId,
            element.firstOrEmpty("w:numPr"),
            numbering
        );
        properties.indent = readParagraphIndent(element.firstOrEmpty("w:ind"));
        return properties;
    });
}
/**
 * Extracts indent attributes from an indent element.
 *
 * `start` falls back to w:left and `end` falls back to w:right when the
 * w:start / w:end attribute is missing or empty.
 */
function readParagraphIndent(element) {
    var attrs = element.attributes;
    var indent = {};
    indent.start = attrs["w:start"] || attrs["w:left"];
    indent.end = attrs["w:end"] || attrs["w:right"];
    indent.firstLine = attrs["w:firstLine"];
    indent.hanging = attrs["w:hanging"];
    return indent;
}
/**
 * Reads run properties (style, font, size and formatting toggles) from a
 * run-properties element.
 *
 * @returns {ReadResult} result whose value is a "runProperties" object;
 *     carries whatever messages readRunStyle produced.
 */
function readRunProperties(element) {
    function valueOf(tagName) {
        return element.firstOrEmpty(tagName).attributes["w:val"];
    }

    function isOn(tagName) {
        return readBooleanElement(element.first(tagName));
    }

    return readRunStyle(element).map(function(style) {
        var rawFontSize = valueOf("w:sz");
        var fontSize = null;
        if (/^[0-9]+$/.test(rawFontSize)) {
            // w:sz is expressed in half points, so halve it to get points
            fontSize = parseInt(rawFontSize, 10) / 2;
        }

        return {
            type: "runProperties",
            styleId: style.styleId,
            styleName: style.name,
            verticalAlignment: valueOf("w:vertAlign"),
            font: element.firstOrEmpty("w:rFonts").attributes["w:ascii"],
            fontSize: fontSize,
            isBold: isOn("w:b"),
            isUnderline: readUnderline(element.first("w:u")),
            isItalic: isOn("w:i"),
            isStrikethrough: isOn("w:strike"),
            isAllCaps: isOn("w:caps"),
            isSmallCaps: isOn("w:smallCaps"),
            highlight: readHighlightValue(valueOf("w:highlight"))
        };
    });
}
/**
 * Decides whether an underline element turns underlining on.
 *
 * A missing element, a missing w:val, and the values "false", "0" and "none"
 * all mean "not underlined"; any other w:val means underlined.
 */
function readUnderline(element) {
    if (!element) {
        return false;
    }
    var offValues = [undefined, "false", "0", "none"];
    return offValues.indexOf(element.attributes["w:val"]) === -1;
}
/**
 * Reads an on/off toggle element.
 *
 * A missing element is off. A present element is on unless its w:val is
 * "false" or "0" (so a missing w:val counts as on).
 */
function readBooleanElement(element) {
    if (!element) {
        return false;
    }
    var value = element.attributes["w:val"];
    return !(value === "false" || value === "0");
}
/**
 * Normalises a highlight value: falsy values and "none" become null, anything
 * else is returned unchanged.
 */
function readHighlightValue(value) {
    return (value && value !== "none") ? value : null;
}
/** Resolves the w:pStyle reference of `element` against the paragraph styles. */
function readParagraphStyle(element) {
    var findStyleById = styles.findParagraphStyleById;
    return readStyle(element, "w:pStyle", "Paragraph", findStyleById);
}
/** Resolves the w:rStyle reference of `element` against the character styles. */
function readRunStyle(element) {
    var findStyleById = styles.findCharacterStyleById;
    return readStyle(element, "w:rStyle", "Run", findStyleById);
}
/** Resolves the w:tblStyle reference of `element` against the table styles. */
function readTableStyle(element) {
    var findStyleById = styles.findTableStyleById;
    return readStyle(element, "w:tblStyle", "Table", findStyleById);
}
/**
 * Looks up the style referenced by the `styleTagName` child of `element`.
 *
 * @param {Object} element properties element to search.
 * @param {string} styleTagName tag holding the style ID in its w:val.
 * @param {string} styleType label used in the undefined-style warning.
 * @param {Function} findStyleById lookup returning a style (with `name`) or a
 *     falsy value.
 * @returns {ReadResult} value is {styleId, name}; a warning is attached when
 *     a non-empty style ID cannot be resolved.
 */
function readStyle(element, styleTagName, styleType, findStyleById) {
    var styleElement = element.first(styleTagName);
    // null when there is no style element at all; otherwise whatever w:val holds
    var styleId = styleElement ? styleElement.attributes["w:val"] : null;
    var name = null;
    var messages = [];

    if (styleId) {
        var style = findStyleById(styleId);
        if (style) {
            name = style.name;
        } else {
            messages.push(undefinedStyleWarning(styleType, styleId));
        }
    }

    return elementResultWithMessages({styleId: styleId, name: name}, messages);
}
// Placeholder pushed for a complex field that is not (or not yet known to be)
// a hyperlink.
var unknownComplexField = {type: "unknown"};

/**
 * Tracks complex-field boundaries.
 *
 * "begin" opens a field and resets the collected instruction text; "separate"
 * replaces the top of the stack with a hyperlink field when the instruction
 * text parses as one; "end" closes the field. Always yields an empty result.
 */
function readFldChar(element) {
    switch (element.attributes["w:fldCharType"]) {
    case "begin":
        complexFieldStack.push(unknownComplexField);
        currentInstrText = [];
        break;
    case "end":
        complexFieldStack.pop();
        break;
    case "separate":
        var hyperlinkOptions = parseHyperlinkFieldCode(currentInstrText.join(''));
        var complexField = unknownComplexField;
        if (hyperlinkOptions !== null) {
            complexField = {type: "hyperlink", options: hyperlinkOptions};
        }
        complexFieldStack.pop();
        complexFieldStack.push(complexField);
        break;
    }
    return emptyResult();
}
/**
 * Returns the options of the innermost open hyperlink field (the one nearest
 * the top of complexFieldStack), or null when no hyperlink field is open.
 */
function currentHyperlinkOptions() {
    var topHyperlink;
    complexFieldStack.forEach(function(complexField) {
        if (complexField.type === "hyperlink") {
            // later entries overwrite earlier ones, leaving the last match
            topHyperlink = complexField;
        }
    });
    return topHyperlink ? topHyperlink.options : null;
}
/**
 * Parses the instruction text of a complex field as a HYPERLINK field code.
 *
 * Only the first quoted argument is captured, so trailing switches that carry
 * their own quoted values (for example `\o "screen tip"`) are not folded into
 * the link target. Any amount of whitespace may separate the tokens.
 *
 * @param {string} code concatenated instruction text of the field.
 * @returns {{href: string}|{anchor: string}|null} `href` for
 *     `HYPERLINK "target"`, `anchor` for `HYPERLINK \l "bookmark"`, or null
 *     when the code is not a recognised hyperlink.
 */
function parseHyperlinkFieldCode(code) {
    // [^"]* instead of a greedy .* so the match stops at the first closing quote
    var externalLinkResult = /HYPERLINK\s+"([^"]*)"/.exec(code);
    if (externalLinkResult) {
        return {href: externalLinkResult[1]};
    }

    var internalLinkResult = /HYPERLINK\s+\\l\s+"([^"]*)"/.exec(code);
    if (internalLinkResult) {
        return {anchor: internalLinkResult[1]};
    }

    return null;
}
/**
 * Records the text of an instruction-text element for the field currently
 * being read; contributes nothing to the output.
 */
function readInstrText(element) {
    var instruction = element.text();
    currentInstrText.push(instruction);
    return emptyResult();
}
/**
 * Converts a w:sym element into a Text element via dingbatToUnicode.
 *
 * See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1.
 * When the lookup fails and the character code has the form F0xx, the lookup
 * is retried with the leading "F0" removed. Unsupported characters are
 * dropped with a warning.
 */
function readSymbol(element) {
    var font = element.attributes["w:font"];
    var charCode = element.attributes["w:char"];

    var match = dingbatToUnicode.hex(font, charCode);
    if (match == null && /^F0..$/.test(charCode)) {
        match = dingbatToUnicode.hex(font, charCode.substring(2));
    }

    if (match != null) {
        return elementResult(new documents.Text(match.string));
    }

    return emptyResultWithMessages([warning(
        "A w:sym element with an unsupported character was ignored: char " + charCode + " in font " + font
    )]);
}
/**
 * Creates an element reader that turns a note-reference element into a
 * NoteReference of the given type, using the element's w:id as the note ID.
 *
 * @param {string} noteType value stored as `noteType` on the reference.
 * @returns {Function} reader taking an element and returning a ReadResult.
 */
function noteReferenceReader(noteType) {
    function readNoteReference(element) {
        var reference = new documents.NoteReference({
            noteType: noteType,
            noteId: element.attributes["w:id"]
        });
        return elementResult(reference);
    }
    return readNoteReference;
}
/** Turns a comment-reference element into a commentReference keyed by its w:id. */
function readCommentReference(element) {
    var commentId = element.attributes["w:id"];
    var reference = documents.commentReference({commentId: commentId});
    return elementResult(reference);
}
/** Reads the children of `element`, discarding the element itself. */
function readChildElements(element) {
    var children = element.children;
    return readXmlElements(children);
}
// Dispatch table used by readXmlElement: maps an XML element name to the
// function that reads it. Each reader takes the element and returns a
// ReadResult.
var xmlElementReaders = {
    "w:p": function(element) {
        var paragraphPropertiesElement = element.firstOrEmpty("w:pPr");

        // A w:del inside the paragraph's w:pPr/w:rPr marks the paragraph as deleted.
        var isDeleted = !!paragraphPropertiesElement
            .firstOrEmpty("w:rPr")
            .first("w:del");

        if (isDeleted) {
            // Hold the contents back; they are prepended to the next
            // paragraph that is not deleted.
            element.children.forEach(function(child) {
                deletedParagraphContents.push(child);
            });
            return emptyResult();
        } else {
            var childrenXml = element.children;
            if (deletedParagraphContents.length > 0) {
                childrenXml = deletedParagraphContents.concat(childrenXml);
                deletedParagraphContents = [];
            }
            // insertExtra appends any elements collected as "extra" (see
            // w:pict below) after the paragraph itself.
            return ReadResult.map(
                readParagraphProperties(paragraphPropertiesElement),
                readXmlElements(childrenXml),
                function(properties, children) {
                    return new documents.Paragraph(children, properties);
                }
            ).insertExtra();
        }
    },
    "w:r": function(element) {
        return ReadResult.map(
            readRunProperties(element.firstOrEmpty("w:rPr")),
            readXmlElements(element.children),
            function(properties, children) {
                // Inside an open hyperlink complex field the run's children
                // are wrapped in a Hyperlink.
                var hyperlinkOptions = currentHyperlinkOptions();
                if (hyperlinkOptions !== null) {
                    children = [new documents.Hyperlink(children, hyperlinkOptions)];
                }

                return new documents.Run(children, properties);
            }
        );
    },
    "w:fldChar": readFldChar,
    "w:instrText": readInstrText,
    "w:t": function(element) {
        return elementResult(new documents.Text(element.text()));
    },
    "w:tab": function(element) {
        return elementResult(new documents.Tab());
    },
    "w:noBreakHyphen": function() {
        return elementResult(new documents.Text("\u2011"));
    },
    "w:softHyphen": function(element) {
        return elementResult(new documents.Text("\u00AD"));
    },
    "w:sym": readSymbol,
    "w:hyperlink": function(element) {
        var relationshipId = element.attributes["r:id"];
        var anchor = element.attributes["w:anchor"];
        return readXmlElements(element.children).map(function(children) {
            function create(options) {
                var targetFrame = element.attributes["w:tgtFrame"] || null;

                return new documents.Hyperlink(
                    children,
                    _.extend({targetFrame: targetFrame}, options)
                );
            }

            if (relationshipId) {
                var href = relationships.findTargetByRelationshipId(relationshipId);
                if (anchor) {
                    href = uris.replaceFragment(href, anchor);
                }
                return create({href: href});
            } else if (anchor) {
                return create({anchor: anchor});
            } else {
                // Neither a relationship nor an anchor: keep the content
                // without wrapping it in a link.
                return children;
            }
        });
    },
    "w:tbl": readTable,
    "w:tr": readTableRow,
    "w:tc": readTableCell,
    "w:footnoteReference": noteReferenceReader("footnote"),
    "w:endnoteReference": noteReferenceReader("endnote"),
    "w:commentReference": readCommentReference,
    "w:br": function(element) {
        var breakType = element.attributes["w:type"];
        if (breakType == null || breakType === "textWrapping") {
            return elementResult(documents.lineBreak);
        } else if (breakType === "page") {
            return elementResult(documents.pageBreak);
        } else if (breakType === "column") {
            return elementResult(documents.columnBreak);
        } else {
            return emptyResultWithMessages([warning("Unsupported break type: " + breakType)]);
        }
    },
    "w:bookmarkStart": function(element){
        var name = element.attributes["w:name"];
        // Bookmarks named _GoBack are dropped.
        if (name === "_GoBack") {
            return emptyResult();
        } else {
            return elementResult(new documents.BookmarkStart({name: name}));
        }
    },

    "mc:AlternateContent": function(element) {
        // firstOrEmpty rather than first: when there is no mc:Fallback child
        // there is simply nothing to read, instead of a TypeError on a
        // missing element.
        return readChildElements(element.firstOrEmpty("mc:Fallback"));
    },

    "w:sdt": function(element) {
        return readXmlElements(element.firstOrEmpty("w:sdtContent").children);
    },

    "w:ins": readChildElements,
    "w:object": readChildElements,
    "w:smartTag": readChildElements,
    "w:drawing": readChildElements,
    "w:pict": function(element) {
        // Everything read from the picture is moved into "extra" so that the
        // enclosing paragraph emits it after itself rather than inside itself.
        return readChildElements(element).toExtra();
    },
    "v:roundrect": readChildElements,
    "v:shape": readChildElements,
    "v:textbox": readChildElements,
    "w:txbxContent": readChildElements,
    "wp:inline": readDrawingElement,
    "wp:anchor": readDrawingElement,
    "v:imagedata": readImageData,
    "v:group": readChildElements,
    "v:rect": readChildElements
};
- return {
- readXmlElement: readXmlElement,
- readXmlElements: readXmlElements
- };
/**
 * Reads a table element: reads its children, resolves vertical merges across
 * the rows, and wraps the rows in a Table carrying the table's properties.
 */
function readTable(element) {
    var propertiesResult = readTableProperties(element.firstOrEmpty("w:tblPr"));
    var rowsResult = readXmlElements(element.children).flatMap(calculateRowSpans);

    return rowsResult.flatMap(function(rows) {
        return propertiesResult.map(function(properties) {
            return documents.Table(rows, properties);
        });
    });
}
/**
 * Reads the table style reference from a table-properties element.
 *
 * @returns {ReadResult} value is {styleId, styleName}.
 */
function readTableProperties(element) {
    function toProperties(style) {
        return {
            styleId: style.styleId,
            styleName: style.name
        };
    }
    return readTableStyle(element).map(toProperties);
}
/**
 * Reads a table row. The row is a header row when its w:trPr contains a
 * w:tblHeader element.
 */
function readTableRow(element) {
    var headerMarker = element.firstOrEmpty("w:trPr").first("w:tblHeader");
    var rowOptions = {isHeader: !!headerMarker};

    return readXmlElements(element.children).map(function(children) {
        return documents.TableRow(children, rowOptions);
    });
}
/**
 * Reads a table cell. colSpan comes from w:gridSpan (1 when absent or empty).
 * The cell is tagged with a temporary `_vMerge` flag that calculateRowSpans
 * consumes and removes.
 */
function readTableCell(element) {
    return readXmlElements(element.children).map(function(children) {
        var properties = element.firstOrEmpty("w:tcPr");
        var gridSpan = properties.firstOrEmpty("w:gridSpan").attributes["w:val"];

        var colSpan = 1;
        if (gridSpan) {
            colSpan = parseInt(gridSpan, 10);
        }

        var cell = documents.TableCell(children, {colSpan: colSpan});
        cell._vMerge = readVMerge(properties);
        return cell;
    });
}
/**
 * Reads the vertical-merge state from cell properties.
 *
 * @returns {boolean|null} null when there is no w:vMerge element; true when
 *     its w:val is "continue" or missing/empty; false for any other value.
 */
function readVMerge(properties) {
    var element = properties.first("w:vMerge");
    if (!element) {
        return null;
    }
    var val = element.attributes["w:val"];
    return !val || val === "continue";
}
/**
 * Converts the per-cell `_vMerge` flags into rowSpan counts.
 *
 * A cell flagged as a merge continuation increments the rowSpan of the cell
 * most recently seen at the same column position and is then removed from
 * its row. If any child of the table is not a row, or any child of a row is
 * not a cell, the rows are returned untouched together with a warning.
 *
 * @param {Array} rows elements read from the table's children.
 * @returns {ReadResult} the (mutated) rows.
 */
function calculateRowSpans(rows) {
    function isNotRow(row) {
        return row.type !== documents.types.tableRow;
    }

    function isNotCell(cell) {
        return cell.type !== documents.types.tableCell;
    }

    if (_.any(rows, isNotRow)) {
        return elementResultWithMessages(rows, [warning(
            "unexpected non-row element in table, cell merging may be incorrect"
        )]);
    }

    var hasNonCell = _.any(rows, function(row) {
        return _.any(row.children, isNotCell);
    });
    if (hasNonCell) {
        return elementResultWithMessages(rows, [warning(
            "unexpected non-cell element in table row, cell merging may be incorrect"
        )]);
    }

    // Column position -> the cell that currently owns that column.
    var owners = {};

    rows.forEach(function(row) {
        var columnIndex = 0;
        row.children.forEach(function(cell) {
            var owner = owners[columnIndex];
            if (cell._vMerge && owner) {
                owner.rowSpan++;
            } else {
                // Either not a continuation, or nothing above to merge into:
                // this cell becomes the owner and is kept.
                owners[columnIndex] = cell;
                cell._vMerge = false;
            }
            columnIndex += cell.colSpan;
        });
    });

    rows.forEach(function(row) {
        var keptCells = row.children.filter(function(cell) {
            return !cell._vMerge;
        });
        keptCells.forEach(function(cell) {
            delete cell._vMerge;
        });
        row.children = keptCells;
    });

    return elementResult(rows);
}
/**
 * Reads every a:blip found under
 * a:graphic / a:graphicData / pic:pic / pic:blipFill of a drawing element and
 * combines the resulting images.
 */
function readDrawingElement(element) {
    var path = ["a:graphic", "a:graphicData", "pic:pic", "pic:blipFill", "a:blip"];
    var blips = path.reduce(function(current, tagName) {
        return current.getElementsByTagName(tagName);
    }, element);

    var results = blips.map(function(blip) {
        return readBlip(element, blip);
    });
    return combineResults(results);
}
/**
 * Reads one a:blip of a drawing as an image.
 *
 * Alt text comes from the drawing's wp:docPr: `descr` when it is not blank,
 * otherwise `title`. A drawing with no wp:docPr yields an image with no alt
 * text instead of aborting the read.
 *
 * @param {Object} element the enclosing drawing element.
 * @param {Object} blip the a:blip element.
 * @returns {ReadResult} the image, or an empty result with a warning when the
 *     blip references no image file.
 */
function readBlip(element, blip) {
    // firstOrEmpty: tolerate a missing wp:docPr rather than throwing on
    // `.attributes` of a missing element.
    var properties = element.firstOrEmpty("wp:docPr").attributes;
    var altText = isBlank(properties.descr) ? properties.title : properties.descr;
    var blipImageFile = findBlipImageFile(blip);
    if (blipImageFile === null) {
        return emptyResultWithMessages([warning("Could not find image file for a:blip element")]);
    }
    return readImage(blipImageFile, altText);
}
/** True for null, undefined, and values whose text contains only whitespace. */
function isBlank(value) {
    if (value == null) {
        return true;
    }
    // "only whitespace" is the same as "contains no non-whitespace character"
    return !/\S/.test(value);
}
/**
 * Resolves the image file referenced by an a:blip element.
 *
 * An r:embed relationship takes precedence and is resolved as an embedded
 * file; otherwise an r:link relationship is resolved to a path read through
 * `files`. Returns null when the blip has neither.
 *
 * @returns {{path: string, read: Function}|null}
 */
function findBlipImageFile(blip) {
    var embedRelationshipId = blip.attributes["r:embed"];
    if (embedRelationshipId) {
        return findEmbeddedImageFile(embedRelationshipId);
    }

    var linkRelationshipId = blip.attributes["r:link"];
    if (!linkRelationshipId) {
        return null;
    }

    var imagePath = relationships.findTargetByRelationshipId(linkRelationshipId);
    return {
        path: imagePath,
        read: files.read.bind(files, imagePath)
    };
}
/**
 * Reads a v:imagedata element as an embedded image, using o:title as the alt
 * text. Without an r:id the element is dropped with a warning.
 */
function readImageData(element) {
    var relationshipId = element.attributes['r:id'];
    if (!relationshipId) {
        return emptyResultWithMessages([warning("A v:imagedata element without a relationship ID was ignored")]);
    }

    var imageFile = findEmbeddedImageFile(relationshipId);
    return readImage(imageFile, element.attributes["o:title"]);
}
/**
 * Resolves a relationship ID to an image stored inside the docx file.
 *
 * @returns {{path: string, read: Function}} the zip entry name (resolved
 *     against "word") and a reader bound to docxFile for that entry.
 */
function findEmbeddedImageFile(relationshipId) {
    var target = relationships.findTargetByRelationshipId(relationshipId);
    var path = uris.uriToZipEntryName("word", target);
    var imageFile = {};
    imageFile.path = path;
    imageFile.read = docxFile.read.bind(docxFile, path);
    return imageFile;
}
/**
 * Builds an Image element for an image file.
 *
 * The content type is looked up from the file's path. Content types that are
 * not listed in supportedImageTypes still produce the image, but with a
 * warning attached.
 *
 * @param {{path: string, read: Function}} imageFile
 * @param {string|undefined} altText
 * @returns {ReadResult}
 */
function readImage(imageFile, altText) {
    var contentType = contentTypes.findContentType(imageFile.path);

    var image = documents.Image({
        readImage: imageFile.read,
        altText: altText,
        contentType: contentType
    });

    // Always an array: the messages of a result are a list everywhere else in
    // this file (e.g. ReadResult.map calls messages.concat), so a bare warning
    // object here would give this result a differently shaped `messages`.
    var warnings = supportedImageTypes[contentType] ?
        [] :
        [warning("Image of type " + contentType + " is unlikely to display in web browsers")];

    return elementResultWithMessages(image, warnings);
}
/** Builds the warning emitted when a referenced style ID has no definition. */
function undefinedStyleWarning(type, styleId) {
    var text = type + " style with ID " + styleId + " was referenced but not defined in the document";
    return warning(text);
}
- }
/**
 * Resolves the numbering level of a paragraph.
 *
 * When the numbering-properties element carries both w:ilvl and w:numId, the
 * level is looked up directly. Otherwise, when a paragraph style ID is given,
 * the level associated with that style is used if there is one.
 *
 * @param {string|null|undefined} styleId paragraph style ID.
 * @param {Object} element numbering-properties element (may be empty).
 * @param {Object} numbering provides findLevel(numId, level) and
 *     findLevelByParagraphStyleId(styleId).
 * @returns {*} the level, or null when none applies.
 */
function readNumberingProperties(styleId, element, numbering) {
    var level = element.firstOrEmpty("w:ilvl").attributes["w:val"];
    var numId = element.firstOrEmpty("w:numId").attributes["w:val"];

    var hasExplicitNumbering = !(level === undefined || numId === undefined);
    if (hasExplicitNumbering) {
        return numbering.findLevel(numId, level);
    }

    if (styleId == null) {
        return null;
    }

    var levelByStyleId = numbering.findLevelByParagraphStyleId(styleId);
    return levelByStyleId == null ? null : levelByStyleId;
}
// Content types for which readImage attaches no warning. Used as a lookup
// set: each listed content type maps to true.
var supportedImageTypes = {};
[
    "image/png",
    "image/gif",
    "image/jpeg",
    "image/svg+xml",
    "image/tiff"
].forEach(function(contentType) {
    supportedImageTypes[contentType] = true;
});
// Element names that readXmlElement drops without a warning. Used as a lookup
// set: each listed name maps to true.
var ignoreElements = {};
[
    "office-word:wrap",
    "v:shadow",
    "v:shapetype",
    "w:annotationRef",
    "w:bookmarkEnd",
    "w:sectPr",
    "w:proofErr",
    "w:lastRenderedPageBreak",
    "w:commentRangeStart",
    "w:commentRangeEnd",
    "w:del",
    "w:footnoteRef",
    "w:endnoteRef",
    "w:pPr",
    "w:rPr",
    "w:tblPr",
    "w:tblGrid",
    "w:trPr",
    "w:tcPr"
].forEach(function(elementName) {
    ignoreElements[elementName] = true;
});
/** A result with no element and no extra, carrying only `messages`. */
function emptyResultWithMessages(messages) {
    var noElement = null;
    var noExtra = null;
    return new ReadResult(noElement, noExtra, messages);
}
/** A result with no element, no extra and no messages. */
function emptyResult() {
    var noElement = null;
    return new ReadResult(noElement);
}
/** Wraps `element` in a result with no extra and no messages. */
function elementResult(element) {
    var result = new ReadResult(element);
    return result;
}
/** Wraps `element` in a result with no extra, carrying `messages`. */
function elementResultWithMessages(element, messages) {
    var noExtra = null;
    return new ReadResult(element, noExtra, messages);
}
/**
 * Result of reading XML: the element(s) read, "extra" elements that should be
 * emitted outside the element currently being built, and messages.
 *
 * @param {*} element value of the result; a falsy value becomes [].
 * @param {Array|null|undefined} extra extra elements; a falsy value becomes [].
 * @param {Array|undefined} messages passed through to the wrapped Result.
 */
function ReadResult(element, extra, messages) {
    this.value = element || [];
    this.extra = extra || [];
    // The wrapped Result carries the normalised extra, so it always agrees
    // with this.extra. flatMap reads extra back out of the wrapped value, and
    // a raw null/undefined there would end up as an entry in the joined list.
    this._result = new Result({
        element: this.value,
        extra: this.extra
    }, messages);
    this.messages = this._result.messages;
}
/**
 * Returns a result with no value whose extra is this result's extra followed
 * by this result's value; messages are kept.
 */
ReadResult.prototype.toExtra = function() {
    var allAsExtra = joinElements(this.extra, this.value);
    return new ReadResult(null, allAsExtra, this.messages);
};
/**
 * Returns a result whose value is this result's value followed by its extra,
 * with extra cleared. When there is no extra, returns this result itself.
 */
ReadResult.prototype.insertExtra = function() {
    var extra = this.extra;
    if (!extra || !extra.length) {
        return this;
    }
    var valueWithExtra = joinElements(this.value, extra);
    return new ReadResult(valueWithExtra, null, this.messages);
};
/**
 * Applies `func` to this result's element; extra is carried over unchanged
 * and messages come from the mapped inner result.
 */
ReadResult.prototype.map = function(func) {
    function applyToElement(payload) {
        return func(payload.element);
    }
    var mapped = this._result.map(applyToElement);
    return new ReadResult(mapped.value, this.extra, mapped.messages);
};
/**
 * Applies `func` (which must return a ReadResult) to this result's element.
 * The new result takes its element from func's result, and its extra is this
 * result's extra followed by the extra of func's result.
 */
ReadResult.prototype.flatMap = function(func) {
    function applyToElement(payload) {
        return func(payload.element)._result;
    }
    var chained = this._result.flatMap(applyToElement);
    var payload = chained.value;
    var combinedExtra = joinElements(this.extra, payload.extra);
    return new ReadResult(payload.element, combinedExtra, chained.messages);
};
/**
 * Combines two results: the value is func(first.value, second.value), and the
 * extras and messages of `first` are followed by those of `second`.
 */
ReadResult.map = function(first, second, func) {
    var value = func(first.value, second.value);
    var extra = joinElements(first.extra, second.extra);
    var messages = first.messages.concat(second.messages);
    return new ReadResult(value, extra, messages);
};
/**
 * Merges a list of ReadResults into one: elements are flattened into a single
 * list, extras are flattened with falsy entries removed, and messages come
 * from the combined inner results.
 */
function combineResults(results) {
    var combined = Result.combine(_.pluck(results, "_result"));
    var payloads = combined.value;

    var elements = _.flatten(_.pluck(payloads, "element"));
    var extras = _.filter(_.flatten(_.pluck(payloads, "extra")), identity);

    return new ReadResult(elements, extras, combined.messages);
}
/**
 * Joins two values (each a single element or a possibly nested array of
 * elements) into one flat array.
 */
function joinElements(first, second) {
    var pair = [first, second];
    return _.flatten(pair);
}
/** Returns its argument unchanged; used as a truthiness predicate for filtering. */
function identity(value) {
    var same = value;
    return same;
}
|