% File load-unicode-xetex-classes.tex
%
% Copyright 2015-2023 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of
% the LaTeX Project Public License (LPPL), either version 1.3c of
% this license or (at your option) any later version. The latest
% version of this license is in the file
% http://www.latex-project.org/lppl.txt.
%
% Issues with this file should be reported at
% https://github.com/latex3/unicode-data
%
% This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
% Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
% the following classes of code point:
% - "ID" (ideographic)
% - "CJ" (conditional Japanese starter)
% - "OP" (opener)
% - "CL" (closer)
% - "NS" (non-starter)
% - "EX" (exclamation)
% - "IS" (infix separator)
% - "CM" (combining marks)
%
% All code points of classes "ID" and "CJ" are assigned to a \XeTeXcharclass,
% but code points of the other classes are only assigned when their East
% Asian Width property is "F", "H" or "W" (Fullwidth, Halfwidth or Wide).
%
% By default the following mappings between Unicode and XeTeX classes are used:
% - "ID" and "CJ" are class 1
% - "OP" is class 2
% - "CL", "NS", "EX", "IS" are class 3
% - "CM" is class 256 (ignored)
% These defaults may be overridden by defining \XeTeXcharclass<class> (for
% example \XeTeXcharclassOP) before this file is loaded. If "ID" or "CL" is
% set explicitly, the other members of the same group above inherit that
% value unless they are themselves set explicitly.
%
% This file does _not_ activate XeTeX's inter-character token mechanism
% (\XeTeXinterchartokenstate is not set) nor does it install any material in
% the inter-character token registers.
%
% Note that this file is separate from the main loader as the data structure
% here may need more refinement at the macro level.
%
% =============================================================================
%
% Only XeTeX can make use of the data read by this file, so stop reading
% straight away when its character class primitive is missing.
\ifx\undefined\XeTeXcharclass \expandafter\endinput \fi
% The parser below also relies on e-TeX primitives: stop if they are absent.
\ifx\undefined\eTeXversion \expandafter\endinput \fi
% IniTeX may be the one reading this file, in which case |{| and |}| do not
% yet have their usual category codes. Everything happens inside a group so
% that only the assignments deliberately made global survive.
\begingroup
  \catcode`\{=1 \catcode`\}=2 %
% |^| has to be a superscript character for the |^^J| notation; character 10
% (|^^J|) is made the newline character so that it breaks lines in messages.
  \catcode`\^=7 \newlinechar=10 %
% Identify this file in the log.
  \message{^^J}%
  \message{load-unicode-xetex-classes.tex v1.17 (2023-09-18)^^J}%
  \message{Reading Unicode east Asian character class data^^J}%
% Comment lines in the data files start with |#|. To recognise them, \hash
% holds a |#| of category code 12: \string produces exactly that, whether
% |#| is currently an "other" character (IniTeX) or a parameter character.
  \edef\hash{\string#}%
% From here on |#| has to be the macro parameter character.
  \catcode`\#=6 %
  % \firsttoken expands to the first token (or brace group) of the material
  % in front of the delimiting \relax and discards the rest. The material
  % must contain at least one token.
  \def\firsttoken#1#2\relax{#1}%
  % Entry point for one line of a data file, terminated by \relax. A line
  % whose first token is |#| is a comment and is dropped; any other line is
  % handed on to \parseunicodedataII. The |?| guarantees that \firsttoken
  % finds a first argument even if the line holds no tokens.
  \def\parseunicodedataI#1\relax{%
    \if\hash\firsttoken#1?\relax
    \else
      \parseunicodedataII#1\relax
    \fi
  }%
% Both files to be parsed here have potential ranges of code points: find the
% first entry and search for the second.
% The line is split at the first semicolon-plus-space: what comes before it
% is the code point field, the word after it (up to the next space) is the
% property value, and the remainder of the line is discarded. Four dots are
% appended to the code point field so that the two |..| delimiters used by
% \parseunicodedataIII are always found; for a single code point the "end of
% range" argument is then empty.
  \def\parseunicodedataII#1; #2 #3\relax{%
    \parseunicodedataIII#1....\relax{#2}%
  }%
% From plain: may not be defined (yet).
% \loop <text> \repeat stores <text> in \body and starts \iterate. <text>
% has to finish inside the true branch of a conditional: when the test
% succeeds \iterate is run again, otherwise the loop ends. \next is used so
% that the recursive call is made only after the conditional is closed.
  \def\loop#1\repeat{\def\body{#1}\iterate}%
  \def\iterate{%
    \body
      \let\next\iterate
    \else
      \let\next\relax
    \fi
    \next
  }%
% Making \repeat equal to \fi keeps conditionals balanced when a complete
% \loop...\repeat is skipped as part of a false branch.
  \let\repeat\fi
% A shared routine for reading the data files: only one part of the parser
% (\parseunicodedataIII and \parseunicodedataIV) has to be altered between
% the two files.
%
% \read returns \par for an empty line, including the one TeX supplies at the
% end of a file: \storedpar is used to recognise and skip those.
  \def\storedpar{\par}%
% \readandparse{<name>} reads <name>.txt through input stream 0. The first
% two lines (which carry the version information) are copied to the log and
% every following non-empty line is passed to \parseunicodedataI. While
% reading, |#| has category code 12 so that comment lines can be detected;
% it is made a parameter character again afterwards because further macro
% definitions follow the first call.
%
% If the file cannot be opened an error naming it is raised and nothing is
% read: without this check \read would fall back to reading from the
% terminal.
  \def\readandparse#1{%
    \openin0=#1.txt %
    \ifeof0 %
      \errmessage{load-unicode-xetex-classes: cannot open #1.txt}%
    \else
% Read two lines from the source file to extract the version information
      \catcode`\#=12 %
      \read0 to \unicodedataline
      \message{\unicodedataline ^^J}%
      \read0 to \unicodedataline
      \message{\unicodedataline ^^J}%
      \loop\unless\ifeof0 %
        \read0 to \unicodedataline
        \unless\ifx\unicodedataline\storedpar
          \expandafter\parseunicodedataI\unicodedataline\relax
        \fi
      \repeat
      \catcode`\#=6 %
    \fi
    \closein0 %
  }%
% Default XeTeX class numbers for the line break classes handled here. A
% default is installed only when \XeTeXcharclass<class> is not yet defined,
% so values set up before this file is loaded are respected. The order
% matters: |CJ| copies whatever |ID| is, and |EX|, |IS| and |NS| copy |CL|.
% NOTE(review): 256 is documented in this file as the "ignored" class; newer
% XeTeX releases are believed to use a different number for that special
% class - confirm against the XeTeX reference guide.
  \unless\ifdefined\XeTeXcharclassID \chardef\XeTeXcharclassID=1 \fi
  \unless\ifdefined\XeTeXcharclassCJ \let\XeTeXcharclassCJ\XeTeXcharclassID \fi
  \unless\ifdefined\XeTeXcharclassOP \chardef\XeTeXcharclassOP=2 \fi
  \unless\ifdefined\XeTeXcharclassCL \chardef\XeTeXcharclassCL=3 \fi
  \unless\ifdefined\XeTeXcharclassEX \let\XeTeXcharclassEX\XeTeXcharclassCL \fi
  \unless\ifdefined\XeTeXcharclassIS \let\XeTeXcharclassIS\XeTeXcharclassCL \fi
  \unless\ifdefined\XeTeXcharclassNS \let\XeTeXcharclassNS\XeTeXcharclassCL \fi
  \unless\ifdefined\XeTeXcharclassCM \chardef\XeTeXcharclassCM=256 \fi
% Check the line break class and if necessary the east Asian width for the
% current code point. For code points of class |ID| or |CJ| there may be a
% range to set, and these are always recorded. In other cases if the code point
% is one of those we may need to set up then save it for checking against the
% list of east Asian widths.
%
% Arguments: the first is the first (or only) code point, the second the last
% code point of a range (empty for a single code point), the third is
% left-over delimiter material and is not used, and the fourth (in braces) is
% the line break class.
  \def\ID{ID}%
  \def\CJ{CJ}%
  \def\parseunicodedataIII#1..#2..#3\relax#4{%
    \def\temp{#4}%
% The number compared here is |0| or |01|: it is non-zero exactly when the
% class is |ID| or |CJ|.
    \ifnum 0%
      \ifx\temp\ID 1\fi
      \ifx\temp\CJ 1\fi
      >0 %
% An empty end-of-range means a single code point: pass it as a range of one.
      \ifx\relax#2\relax
        \parseunicodedataIV{#1}{#1}{#4}%
      \else
        \parseunicodedataIV{#1}{#2}{#4}%
      \fi
    \else
% Other classes are of interest only if a \XeTeXcharclass<class> exists for
% them. The class name is stored in a macro called LB@<decimal code point>
% so that it can be looked up when the east Asian widths are read.
      \ifcsname XeTeXcharclass#4\endcsname
        \ifx\relax#2\relax
          \expandafter\def\csname LB@\number"#1\endcsname{#4}%
        \else
% This runs inside the loop of \readandparse. The definitions made here are
% local and must survive, so no group is used: \body is saved and restored
% around the inner loop instead.
          \let\savedbody\body
          \count0="#1 %
          \loop
            \unless\ifnum\count0>"#2 %
            \expandafter\def\csname LB@\number\count0 \endcsname{#4}%
            \advance\count0 by 1 %
          \repeat
          \let\body\savedbody
        \fi
      \fi
    \fi
  }%
% \parseunicodedataIV{<first>}{<last>}{<class>} globally assigns the XeTeX
% class belonging to <class> to every code point from <first> to <last>
% (both hexadecimal). It is called from within the loop of \readandparse,
% so its own loop runs inside a group: that protects the outer loop body
% and keeps the scratch register and helper macro local.
  \def\parseunicodedataIV#1#2#3{%
    \begingroup
      \expandafter\let\expandafter\classvalue
        \csname XeTeXcharclass#3\endcsname
      \count255="#1 %
      \loop\unless\ifnum\count255>"#2 %
        \global\XeTeXcharclass\count255=\classvalue
        \advance\count255 by 1 %
      \repeat
    \endgroup
  }%
% First pass: line break classes.
  \readandparse{LineBreak}%
% Replacement parser step for |EastAsianWidth.txt|. Only widths whose first
% letter is |F|, |H| or |W| lead to any action. A single code point is passed
% to \parseunicodedataIV as a hexadecimal constant; for a range each code
% point is passed in turn as the scratch register holding it. The range loop
% runs in a group to protect the loop of \readandparse.
  \def\parseunicodedataIII#1..#2..#3\relax#4{%
    \edef\widthinitial{\firsttoken#4\relax}%
    \ifnum 0%
      \if F\widthinitial 1\fi
      \if H\widthinitial 1\fi
      \if W\widthinitial 1\fi
      >0 %
      \ifx\relax#2\relax
        \parseunicodedataIV{"#1}%
      \else
        \begingroup
          \count255="#1 %
          \loop\unless\ifnum\count255>"#2 %
            \parseunicodedataIV{\count255}%
            \advance\count255 by 1 %
          \repeat
        \endgroup
      \fi
    \fi
  }%
% Replacement handler for a single code point, given as anything TeX accepts
% as a number. Nothing happens unless a line break class was stored for the
% code point during the first pass; if one was, its name selects the
% \XeTeXcharclass<class> value, which is assigned globally.
  \def\parseunicodedataIV#1{%
    \ifcsname LB@\number#1\endcsname
      \edef\savedclass{\csname LB@\number#1\endcsname}%
      \global\XeTeXcharclass#1=\csname XeTeXcharclass\savedclass\endcsname
    \fi
  }%
% Second pass: east Asian widths. Closing the group then discards all of
% the local helpers defined in this file.
  \readandparse{EastAsianWidth}%
\endgroup