/**
 * W3C XML Schema Definition Language (XSD) 1.1 Part 2: Datatypes.
 * --------------------------------------------------------------
 *  A W3C Recommendation published on 2012-04-05
 *  @see https://www.w3.org/TR/xmlschema11-2/
 */
import { createToken, EmbeddedActionsParser, Lexer } from "chevrotain";

// The tokens are deliberately large: this way the input is already
// disambiguated during tokenisation, and no backtracking is needed during parsing.
/**
 * Regex source for a single Base64 character.
 * [34] B64char ::= [A-Za-z0-9+/]
 */
const B64charPattern = "[A-Za-z0-9+/]";

/**
 * Regex source for three octets of binary data without a trailing space:
 * four Base64 characters, each of the first three optionally followed by
 * one space.
 *
 * [30] B64finalquad ::= (B64 B64 B64 B64char)
 * [33] B64 ::= B64char #x20?
 *
 * PS. #x20 is a space character.
 */
const B64finalquadPattern = new Array(4).fill(B64charPattern).join(" ?");

/**
 * Regex source for the Base64 characters whose bit-string value ends in '00'.
 * [36] B16char ::= [AEIMQUYcgkosw048]
 */
const B16charPattern = "[AEIMQUYcgkosw048]";

/**
 * Regex source for the Base64 characters whose bit-string value ends in '0000'.
 * [38] B04char ::= [AQgw]
 */
const B04charPattern = "[AQgw]";

/**
 * Token for three octets of binary data followed by a space.
 *
 * Grammar in the specification:
 * [28] B64quad ::= (B64 B64 B64 B64)
 * [33] B64 ::= B64char #x20?
 * [34] B64char ::= [A-Za-z0-9+/]
 *
 * In the specification the trailing space of the fourth character is
 * optional. To remove tokenisation ambiguity it is REQUIRED in this token;
 * the variant without the trailing space is a separate token. Because the
 * lexer greedily uses the first matching token, this token (with the trailing
 * space) must be listed before the one without it.
 *
 * PS. #x20 is a space character.
 */
export const B64quadWithTrailingSpace = createToken({
  pattern: new RegExp(`${B64finalquadPattern} `),
  name: "B64quad",
});

/**
 * Token for three octets of binary data that are NOT followed by a space.
 *
 * [30] B64finalquad ::= (B64 B64 B64 B64char)
 */
export const B64quadWithoutTrailingSpace = createToken({
  pattern: new RegExp(B64finalquadPattern),
  name: "B64finalquad",
});

/**
 * Token for two octets of binary data at the end of the input.
 *
 * [31] Padded16 ::= B64 B64 B16 '='
 *
 * B16 is restricted to the Base64 characters whose bit-string value ends in '00':
 * [35] B16 ::= B16char #x20?
 * [36] B16char ::= [AEIMQUYcgkosw048]
 */
export const Padded16 = createToken({
  // Each of the three characters may be followed by one optional space.
  pattern: new RegExp([B64charPattern, B64charPattern, B16charPattern, "="].join(" ?")),
  name: "Padded16",
});

/**
 * Token for a single octet of binary data at the end of the input.
 *
 * [32] Padded8 ::= B64 B04 '=' #x20? '='
 *
 * B04 is restricted to the Base64 characters whose bit-string value ends in '0000':
 * [37] B04 ::= B04char #x20?
 * [38] B04char ::= [AQgw]
 */
export const Padded8 = createToken({
  // One optional space is allowed after each character and between the two '='.
  pattern: new RegExp([B64charPattern, B04charPattern, "=", "="].join(" ?")),
  name: "Padded8",
});
// Token vocabulary shared by the lexer and the parser. Order matters: the quad
// with the required trailing space is listed before the quad without it.
const tokens = [
  B64quadWithTrailingSpace,
  B64quadWithoutTrailingSpace,
  Padded16,
  Padded8,
];
/**
 * Parser for the lexical representation of Base64Binary.
 *
 * Its single rule, `base64Binary`, returns the concatenated images of the
 * consumed tokens with every space character removed.
 */
export class Base64Parser extends EmbeddedActionsParser {
  constructor() {
    super(tokens, {
      // See https://chevrotain.io/docs/guide/initialization_performance.html#use-a-smaller-global-maxlookahead
      // No big performance difference is expected between this value and the
      // default (maxLookAhead=3); it is kept as a best practice for rules that
      // may be designed in the future.
      maxLookahead: 2,
    });
    this.performSelfAnalysis();
  }

  /**
   * Rule for a complete Base64Binary value.
   *
   * Grammar in the specification:
   * [27] Base64Binary ::= (B64quad* B64final)?
   * [28] B64quad ::= (B64 B64 B64 B64)
   * [29] B64final ::= B64finalquad | Padded16 | Padded8
   * [30] B64finalquad ::= (B64 B64 B64 B64char)
   *
   * `B64quad` equals `B64finalquad` plus an optional trailing space, which is
   * ambiguous for the lexer. Two tokens are used instead
   * (B64quadWithTrailingSpace and B64quadWithoutTrailingSpace), which gives
   * the equivalent grammar:
   * [27] Base64Binary ::= ((B64quadWithTrailingSpace | B64quadWithoutTrailingSpace)* B64final)?
   * [29] B64final ::= B64quadWithoutTrailingSpace | Padded16 | Padded8
   *
   * @returns the matched text without any space characters
   * @throws {RangeError} when the matched text ends with a space
   */
  public base64Binary = this.RULE("Base64Binary", () => {
    // `this.MANY` is greedy, so "everything except the last
    // B64quadWithoutTrailingSpace" cannot be expressed directly. Instead:
    //  1. consume as many quads (with or without trailing space) as possible,
    //  2. optionally consume one Padded16 or Padded8,
    //  3. reject the input afterwards if it ended with a space.
    let lexical = "";

    this.MANY(() => {
      const quad = this.OR([
        { ALT: () => this.CONSUME(B64quadWithTrailingSpace) },
        { ALT: () => this.CONSUME(B64quadWithoutTrailingSpace) },
      ]);
      lexical += quad.image;
    });

    this.OPTION(() => {
      const padded = this.OR2([
        { ALT: () => this.CONSUME(Padded16) },
        { ALT: () => this.CONSUME(Padded8) },
      ]);
      lexical += padded.image;
    });

    // Only a final B64quadWithTrailingSpace can leave a space at the very end.
    this.ACTION(() => {
      if (lexical.endsWith(" ")) {
        throw new RangeError("No trailing space allowed for Base64Binary");
      }
    });

    return lexical.replaceAll(" ", "");
  });
}
/** Lexer that splits Base64Binary text into the tokens declared in this module. */
export const lexer = new Lexer(tokens, {
  // Lines are not tracked for this grammar (leaving this option unset prints a warning).
  positionTracking: "onlyOffset",
  ensureOptimizations: true,
});
/** Module-level `Base64Parser` instance, constructed once when this module is evaluated. */
export const parser = new Base64Parser();
