beall-11ty/node_modules/@rgrove/parse-xml/dist/lib/Parser.js

"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
const StringScanner_js_1 = require("./StringScanner.js");
const syntax = __importStar(require("./syntax.js"));
const XmlCdata_js_1 = require("./XmlCdata.js");
const XmlComment_js_1 = require("./XmlComment.js");
const XmlDeclaration_js_1 = require("./XmlDeclaration.js");
const XmlDocument_js_1 = require("./XmlDocument.js");
const XmlDocumentType_js_1 = require("./XmlDocumentType.js");
const XmlElement_js_1 = require("./XmlElement.js");
const XmlError_js_1 = require("./XmlError.js");
const XmlNode_js_1 = require("./XmlNode.js");
const XmlProcessingInstruction_js_1 = require("./XmlProcessingInstruction.js");
const XmlText_js_1 = require("./XmlText.js");
const emptyString = '';
/**
 * Parses an XML string into an `XmlDocument`.
 *
 * @private
 */
class Parser {
    /**
     * @param xml XML string to parse.
     * @param options Parser options.
     */
    constructor(xml, options = {}) {
        let doc = this.document = new XmlDocument_js_1.XmlDocument();
        this.currentNode = doc;
        this.options = options;
        this.scanner = new StringScanner_js_1.StringScanner(xml);
        if (this.options.includeOffsets) {
            doc.start = 0;
            doc.end = xml.length;
        }
        this.parse();
    }
    /**
     * Adds the given `XmlNode` as a child of `this.currentNode`.
     */
    addNode(node, charIndex) {
        node.parent = this.currentNode;
        if (this.options.includeOffsets) {
            node.start = this.scanner.charIndexToByteIndex(charIndex);
            node.end = this.scanner.charIndexToByteIndex();
        }
        // @ts-expect-error: XmlDocument has a more limited set of possible children
        // than XmlElement so TypeScript is unhappy, but we always do the right
        // thing.
        this.currentNode.children.push(node);
        return true;
    }
    /**
     * Adds the given _text_ to the document, either by appending it to a
     * preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
     */
    addText(text, charIndex) {
        let { children } = this.currentNode;
        let { length } = children;
        text = normalizeLineBreaks(text);
        if (length > 0) {
            let prevNode = children[length - 1];
            if (prevNode?.type === XmlNode_js_1.XmlNode.TYPE_TEXT) {
                let textNode = prevNode;
                // The previous node is a text node, so we can append to it and avoid
                // creating another node.
                textNode.text += text;
                if (this.options.includeOffsets) {
                    textNode.end = this.scanner.charIndexToByteIndex();
                }
                return true;
            }
        }
        return this.addNode(new XmlText_js_1.XmlText(text), charIndex);
    }
    /**
     * Consumes element attributes.
     *
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-starttags
     */
    consumeAttributes() {
        let attributes = Object.create(null);
        while (this.consumeWhitespace()) {
            let attrName = this.consumeName();
            if (!attrName) {
                break;
            }
            let attrValue = this.consumeEqual() && this.consumeAttributeValue();
            if (attrValue === false) {
                throw this.error('Attribute value expected');
            }
            if (attrName in attributes) {
                throw this.error(`Duplicate attribute: ${attrName}`);
            }
            if (attrName === 'xml:space'
                && attrValue !== 'default'
                && attrValue !== 'preserve') {
                throw this.error('Value of the `xml:space` attribute must be "default" or "preserve"');
            }
            attributes[attrName] = attrValue;
        }
        if (this.options.sortAttributes) {
            let attrNames = Object.keys(attributes).sort();
            let sortedAttributes = Object.create(null);
            for (let i = 0; i < attrNames.length; ++i) {
                let attrName = attrNames[i];
                sortedAttributes[attrName] = attributes[attrName];
            }
            attributes = sortedAttributes;
        }
        return attributes;
    }
    /**
     * Consumes an `AttValue` (attribute value) if possible.
     *
     * @returns
     *   Contents of the `AttValue` minus quotes, or `false` if nothing was
     *   consumed. An empty string indicates that an `AttValue` was consumed but
     *   was empty.
     *
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-AttValue
     */
    consumeAttributeValue() {
        let { scanner } = this;
        let quote = scanner.peek();
        if (quote !== '"' && quote !== "'") {
            return false;
        }
        scanner.advance();
        let chars;
        let isClosed = false;
        let value = emptyString;
        let regex = quote === '"'
            ? syntax.attValueCharDoubleQuote
            : syntax.attValueCharSingleQuote;
        matchLoop: while (!scanner.isEnd) {
            chars = scanner.consumeUntilMatch(regex);
            if (chars) {
                this.validateChars(chars);
                value += chars.replace(syntax.attValueNormalizedWhitespace, ' ');
            }
            switch (scanner.peek()) {
                case quote:
                    isClosed = true;
                    break matchLoop;
                case '&':
                    value += this.consumeReference();
                    continue;
                case '<':
                    throw this.error('Unescaped `<` is not allowed in an attribute value');
                default:
                    break matchLoop;
            }
        }
        if (!isClosed) {
            throw this.error('Unclosed attribute');
        }
        scanner.advance();
        return value;
    }
    /**
     * Consumes a CDATA section if possible.
     *
     * @returns Whether a CDATA section was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-cdata-sect
     */
    consumeCdataSection() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<![CDATA[')) {
            return false;
        }
        let text = scanner.consumeUntilString(']]>');
        this.validateChars(text);
        if (!scanner.consumeString(']]>')) {
            throw this.error('Unclosed CDATA section');
        }
        return this.options.preserveCdata
            ? this.addNode(new XmlCdata_js_1.XmlCdata(normalizeLineBreaks(text)), startIndex)
            : this.addText(text, startIndex);
    }
    /**
     * Consumes character data if possible.
     *
     * @returns Whether character data was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#dt-chardata
     */
    consumeCharData() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        let charData = scanner.consumeUntilMatch(syntax.endCharData);
        if (!charData) {
            return false;
        }
        this.validateChars(charData);
        if (scanner.peek(3) === ']]>') {
            throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
        }
        return this.addText(charData, startIndex);
    }
    /**
     * Consumes a comment if possible.
     *
     * @returns Whether a comment was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Comment
     */
    consumeComment() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<!--')) {
            return false;
        }
        let content = scanner.consumeUntilString('--');
        this.validateChars(content);
        if (!scanner.consumeString('-->')) {
            if (scanner.peek(2) === '--') {
                throw this.error("The string `--` isn't allowed inside a comment");
            }
            throw this.error('Unclosed comment');
        }
        return this.options.preserveComments
            ? this.addNode(new XmlComment_js_1.XmlComment(normalizeLineBreaks(content)), startIndex)
            : true;
    }
    /**
     * Consumes a reference in a content context if possible.
     *
     * This differs from `consumeReference()` in that a consumed reference will be
     * added to the document as a text node instead of returned.
     *
     * @returns Whether a reference was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
     */
    consumeContentReference() {
        let startIndex = this.scanner.charIndex;
        let ref = this.consumeReference();
        return ref
            ? this.addText(ref, startIndex)
            : false;
    }
    /**
     * Consumes a doctype declaration if possible.
     *
     * This is a loose implementation since doctype declarations are currently
     * discarded without further parsing.
     *
     * @returns Whether a doctype declaration was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#dtd
     */
    consumeDoctypeDeclaration() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<!DOCTYPE')) {
            return false;
        }
        let name = this.consumeWhitespace()
            && this.consumeName();
        if (!name) {
            throw this.error('Expected a name');
        }
        let publicId;
        let systemId;
        if (this.consumeWhitespace()) {
            if (scanner.consumeString('PUBLIC')) {
                publicId = this.consumeWhitespace()
                    && this.consumePubidLiteral();
                if (publicId === false) {
                    throw this.error('Expected a public identifier');
                }
                this.consumeWhitespace();
            }
            if (publicId !== undefined || scanner.consumeString('SYSTEM')) {
                this.consumeWhitespace();
                systemId = this.consumeSystemLiteral();
                if (systemId === false) {
                    throw this.error('Expected a system identifier');
                }
                this.consumeWhitespace();
            }
        }
        let internalSubset;
        if (scanner.consumeString('[')) {
            // The internal subset may contain comments that contain `]` characters,
            // so we can't use `consumeUntilString()` here.
            internalSubset = scanner.consumeUntilMatch(/\][\x20\t\r\n]*>/);
            if (!scanner.consumeString(']')) {
                throw this.error('Unclosed internal subset');
            }
            this.consumeWhitespace();
        }
        if (!scanner.consumeString('>')) {
            throw this.error('Unclosed doctype declaration');
        }
        return this.options.preserveDocumentType
            ? this.addNode(new XmlDocumentType_js_1.XmlDocumentType(name, publicId, systemId, internalSubset), startIndex)
            : true;
    }
    /**
     * Consumes an element if possible.
     *
     * @returns Whether an element was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-element
     */
    consumeElement() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<')) {
            return false;
        }
        let name = this.consumeName();
        if (!name) {
            scanner.reset(startIndex);
            return false;
        }
        let attributes = this.consumeAttributes();
        let isEmpty = !!scanner.consumeString('/>');
        let element = new XmlElement_js_1.XmlElement(name, attributes);
        element.parent = this.currentNode;
        if (!isEmpty) {
            if (!scanner.consumeString('>')) {
                throw this.error(`Unclosed start tag for element \`${name}\``);
            }
            this.currentNode = element;
            do {
                this.consumeCharData();
            } while (this.consumeElement()
                || this.consumeContentReference()
                || this.consumeCdataSection()
                || this.consumeProcessingInstruction()
                || this.consumeComment());
            let endTagMark = scanner.charIndex;
            let endTagName;
            if (!scanner.consumeString('</')
                || !(endTagName = this.consumeName())
                || endTagName !== name) {
                scanner.reset(endTagMark);
                throw this.error(`Missing end tag for element ${name}`);
            }
            this.consumeWhitespace();
            if (!scanner.consumeString('>')) {
                throw this.error(`Unclosed end tag for element ${name}`);
            }
            this.currentNode = element.parent;
        }
        return this.addNode(element, startIndex);
    }
    /**
     * Consumes an `Eq` production if possible.
     *
     * @returns Whether an `Eq` production was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Eq
     */
    consumeEqual() {
        this.consumeWhitespace();
        if (this.scanner.consumeString('=')) {
            this.consumeWhitespace();
            return true;
        }
        return false;
    }
    /**
     * Consumes `Misc` content if possible.
     *
     * @returns Whether anything was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Misc
     */
    consumeMisc() {
        return this.consumeComment()
            || this.consumeProcessingInstruction()
            || this.consumeWhitespace();
    }
    /**
     * Consumes one or more `Name` characters if possible.
     *
     * @returns `Name` characters, or an empty string if none were consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Name
     */
    consumeName() {
        return syntax.isNameStartChar(this.scanner.peek())
            ? this.scanner.consumeMatchFn(syntax.isNameChar)
            : emptyString;
    }
    /**
     * Consumes a processing instruction if possible.
     *
     * @returns Whether a processing instruction was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-pi
     */
    consumeProcessingInstruction() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<?')) {
            return false;
        }
        let name = this.consumeName();
        if (name) {
            if (name.toLowerCase() === 'xml') {
                scanner.reset(startIndex);
                throw this.error("XML declaration isn't allowed here");
            }
        }
        else {
            throw this.error('Invalid processing instruction');
        }
        if (!this.consumeWhitespace()) {
            if (scanner.consumeString('?>')) {
                return this.addNode(new XmlProcessingInstruction_js_1.XmlProcessingInstruction(name), startIndex);
            }
            throw this.error('Whitespace is required after a processing instruction name');
        }
        let content = scanner.consumeUntilString('?>');
        this.validateChars(content);
        if (!scanner.consumeString('?>')) {
            throw this.error('Unterminated processing instruction');
        }
        return this.addNode(new XmlProcessingInstruction_js_1.XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
    }
    /**
     * Consumes a prolog if possible.
     *
     * @returns Whether a prolog was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd
     */
    consumeProlog() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        this.consumeXmlDeclaration();
        while (this.consumeMisc()) { } // eslint-disable-line no-empty
        if (this.consumeDoctypeDeclaration()) {
            while (this.consumeMisc()) { } // eslint-disable-line no-empty
        }
        return startIndex < scanner.charIndex;
    }
    /**
     * Consumes a public identifier literal if possible.
     *
     * @returns
     *   Value of the public identifier literal minus quotes, or `false` if
     *   nothing was consumed. An empty string indicates that a public id literal
     *   was consumed but was empty.
     *
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-PubidLiteral
     */
    consumePubidLiteral() {
        let startIndex = this.scanner.charIndex;
        let value = this.consumeSystemLiteral();
        if (value !== false && !/^[-\x20\r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*$/.test(value)) {
            this.scanner.reset(startIndex);
            throw this.error('Invalid character in public identifier');
        }
        return value;
    }
    /**
     * Consumes a reference if possible.
     *
     * This differs from `consumeContentReference()` in that a consumed reference
     * will be returned rather than added to the document.
     *
     * @returns
     *   Parsed reference value, or `false` if nothing was consumed (to
     *   distinguish from a reference that resolves to an empty string).
     *
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Reference
     */
    consumeReference() {
        let { scanner } = this;
        if (!scanner.consumeString('&')) {
            return false;
        }
        let ref = scanner.consumeMatchFn(syntax.isReferenceChar);
        if (scanner.consume() !== ';') {
            throw this.error('Unterminated reference (a reference must end with `;`)');
        }
        let parsedValue;
        if (ref[0] === '#') {
            // This is a character reference.
            let codePoint = ref[1] === 'x'
                ? parseInt(ref.slice(2), 16) // Hex codepoint.
                : parseInt(ref.slice(1), 10); // Decimal codepoint.
            if (isNaN(codePoint)) {
                throw this.error('Invalid character reference');
            }
            if (!syntax.isXmlCodePoint(codePoint)) {
                throw this.error('Character reference resolves to an invalid character');
            }
            parsedValue = String.fromCodePoint(codePoint);
        }
        else {
            // This is an entity reference.
            parsedValue = syntax.predefinedEntities[ref];
            if (parsedValue === undefined) {
                let { ignoreUndefinedEntities, resolveUndefinedEntity, } = this.options;
                let wrappedRef = `&${ref};`; // for backcompat with <= 2.x
                if (resolveUndefinedEntity) {
                    let resolvedValue = resolveUndefinedEntity(wrappedRef);
                    if (resolvedValue !== null && resolvedValue !== undefined) {
                        let type = typeof resolvedValue;
                        if (type !== 'string') {
                            throw new TypeError(`\`resolveUndefinedEntity()\` must return a string, \`null\`, or \`undefined\`, but returned a value of type ${type}`);
                        }
                        return resolvedValue;
                    }
                }
                if (ignoreUndefinedEntities) {
                    return wrappedRef;
                }
                scanner.reset(-wrappedRef.length);
                throw this.error(`Named entity isn't defined: ${wrappedRef}`);
            }
        }
        return parsedValue;
    }
    /**
     * Consumes a `SystemLiteral` if possible.
     *
     * A `SystemLiteral` is similar to an attribute value, but allows the
     * characters `<` and `&` and doesn't replace references.
     *
     * @returns
     *   Value of the `SystemLiteral` minus quotes, or `false` if nothing was
     *   consumed. An empty string indicates that a `SystemLiteral` was consumed
     *   but was empty.
     *
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-SystemLiteral
     */
    consumeSystemLiteral() {
        let { scanner } = this;
        let quote = scanner.consumeString('"') || scanner.consumeString("'");
        if (!quote) {
            return false;
        }
        let value = scanner.consumeUntilString(quote);
        this.validateChars(value);
        if (!scanner.consumeString(quote)) {
            throw this.error('Missing end quote');
        }
        return value;
    }
    /**
     * Consumes one or more whitespace characters if possible.
     *
     * @returns Whether any whitespace characters were consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#white
     */
    consumeWhitespace() {
        return !!this.scanner.consumeMatchFn(syntax.isWhitespace);
    }
    /**
     * Consumes an XML declaration if possible.
     *
     * @returns Whether an XML declaration was consumed.
     * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-XMLDecl
     */
    consumeXmlDeclaration() {
        let { scanner } = this;
        let startIndex = scanner.charIndex;
        if (!scanner.consumeString('<?xml')) {
            return false;
        }
        if (!this.consumeWhitespace()) {
            throw this.error('Invalid XML declaration');
        }
        let version = !!scanner.consumeString('version')
            && this.consumeEqual()
            && this.consumeSystemLiteral();
        if (version === false) {
            throw this.error('XML version is missing or invalid');
        }
        else if (!/^1\.[0-9]+$/.test(version)) {
            throw this.error('Invalid character in version number');
        }
        let encoding;
        let standalone;
        if (this.consumeWhitespace()) {
            encoding = !!scanner.consumeString('encoding')
                && this.consumeEqual()
                && this.consumeSystemLiteral();
            if (encoding) {
                if (!/^[A-Za-z][\w.-]*$/.test(encoding)) {
                    throw this.error('Invalid character in encoding name');
                }
                this.consumeWhitespace();
            }
            standalone = !!scanner.consumeString('standalone')
                && this.consumeEqual()
                && this.consumeSystemLiteral();
            if (standalone) {
                if (standalone !== 'yes' && standalone !== 'no') {
                    throw this.error('Only "yes" and "no" are permitted as values of `standalone`');
                }
                this.consumeWhitespace();
            }
        }
        if (!scanner.consumeString('?>')) {
            throw this.error('Invalid or unclosed XML declaration');
        }
        return this.options.preserveXmlDeclaration
            ? this.addNode(new XmlDeclaration_js_1.XmlDeclaration(version, encoding || undefined, standalone || undefined), startIndex)
            : true;
    }
    /**
     * Returns an `XmlError` for the current scanner position.
     */
    error(message) {
        let { scanner } = this;
        return new XmlError_js_1.XmlError(message, scanner.charIndex, scanner.string);
    }
    /**
     * Parses the XML input.
     */
    parse() {
        this.scanner.consumeString('\uFEFF'); // byte order mark
        this.consumeProlog();
        if (!this.consumeElement()) {
            throw this.error('Root element is missing or invalid');
        }
        while (this.consumeMisc()) { } // eslint-disable-line no-empty
        if (!this.scanner.isEnd) {
            throw this.error('Extra content at the end of the document');
        }
    }
    /**
     * Throws an invalid character error if any character in the given _string_
     * isn't a valid XML character.
     */
    validateChars(string) {
        let { length } = string;
        for (let i = 0; i < length; ++i) {
            let cp = string.codePointAt(i);
            if (!syntax.isXmlCodePoint(cp)) {
                this.scanner.reset(-([...string].length - i));
                throw this.error('Invalid character');
            }
            if (cp > 65535) {
                i += 1;
            }
        }
    }
}
exports.Parser = Parser;
// -- Private Functions --------------------------------------------------------
/**
 * Normalizes line breaks in the given text by replacing CRLF sequences and lone
 * CR characters with LF characters.
 */
function normalizeLineBreaks(text) {
    let i = 0;
    while ((i = text.indexOf('\r', i)) !== -1) {
        text = text[i + 1] === '\n'
            ? text.slice(0, i) + text.slice(i + 1)
            : text.slice(0, i) + '\n' + text.slice(i + 1);
    }
    return text;
}
//# sourceMappingURL=Parser.js.map