% File load-unicode-xetex-classes.tex
%
% Copyright 2015-2023 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of
% the LaTeX Project Public License (LPPL), either version 1.3c of
% this license or (at your option) any later version. The latest
% version of this license is in the file
%    http://www.latex-project.org/lppl.txt.
%
% Issues with this file should be reported at
%    https://github.com/latex3/unicode-data
%
% This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
% Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
% the following classes of code point:
%  - "ID" (ideographic)
%  - "CJ" (conditional Japanese starter)
%  - "OP" (opener)
%  - "CL" (closer)
%  - "NS" (non-starter)
%  - "EX" (exclamation)
%  - "IS" (infix separator)
%  - "CM" (combining marks)
%
% All code points of classes "ID" and "CJ" are assigned to a \XeTeXcharclass,
% but for other classes this only occurs when they fall into east Asian width
% type "F", "H" or "W" (full-, half- and wide-width).
%
% The following mappings between Unicode and XeTeX classes occur
%  - "ID" and "CJ" are class 1
%  - "OP" is class 2
%  - "CL", "NS", "EX", "IS" are class 3
%  - "CM" is class 256 (ignored)
% as standard: these may be over-ridden by defining \XeTeXcharclass<class>
% (for example \XeTeXcharclassID) as required before this file is loaded.
% (If classes "ID" or "CL" are explicitly set, the other members
% of the same groups above will inherit these values.)
%
% This file does _not_ activate XeTeX's inter-character token mechanism
% (\XeTeXinterchartokenstate is not set) nor does it install any material in
% the inter-character token registers.
%
% Note that this file is separate from the main loader as the data structure
% here may need more refinement at the macro level.
%
% =============================================================================
%
% The data loaded here can currently only be used by XeTeX: check for the
% appropriate primitive.
\ifx\XeTeXcharclass\undefined
  \expandafter\endinput
\fi
% Just in case, check for the e-TeX extensions. In both checks the
% |\expandafter| closes the conditional before |\endinput| takes effect.
\ifx\eTeXversion\undefined
  \expandafter\endinput
\fi
% This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
% |#| may not be correct. Everything is done in a group so that only the
% settings we want to propagate are made available generally.
\begingroup
\catcode`\{=1 %
\catcode`\}=2 %
% Write some basic information to the log. The |^^J| notation needs |^| to be
% a superscript character, and |\newlinechar| makes |^^J| start a new line in
% |\message| output.
\catcode`\^=7 %
\newlinechar=`\^^J %
\message{^^J}%
\message{load-unicode-xetex-classes.tex v1.17 (2023-09-18)^^J}%
\message{Reading Unicode east Asian character class data^^J}%
% A string version of |#| will be needed to look for comment lines in the
% source. Once that is done proper parsing can begin.
\catcode`\#=12 %
\def\hash{#}%
\catcode`\#=6 %
% |\firsttoken| leaves only the first token (or braced group) of the material
% that precedes the next |\relax|.
\def\firsttoken#1#2\relax{#1}%
% |\parseunicodedataI| receives one line of a data file, terminated by
% |\relax|. A line whose first token is the stored |\hash| character is a
% comment and is dropped; any other line is handed on for splitting. The |?|
% guarantees that |\firsttoken| has something to pick up even if the line
% is empty.
\def\parseunicodedataI#1\relax{%
  \unless\if\hash\firsttoken#1?\relax
    \parseunicodedataII#1\relax
  \fi
}%
% Both files to be parsed here have potential ranges of code points: find the
% first entry and search for the second. The line is split at the semicolon:
% the part before it is the code point or range, the word after it is the
% property value and the rest of the line is discarded. Four dots are added
% so that |\parseunicodedataIII| can match its |..| delimiters whether or not
% the first part is a range.
\def\parseunicodedataII#1; #2 #3\relax{%
  \parseunicodedataIII#1....\relax{#2}%
}%
% From plain: may not be defined (yet). |\loop| stores everything up to
% |\repeat| in |\body|, which must leave a conditional open for |\iterate| to
% finish off. |\repeat| is |\fi| so that a |\loop| construct appearing in
% skipped conditional text stays balanced.
\def\loop#1\repeat{\def\body{#1}\iterate}%
\def\iterate{%
  \body
    \let\next\iterate
  \else
    \let\next\relax
  \fi
  \next
}%
\let\repeat\fi
% A shared routine for reading the data files: only one part of the parser
% has to be altered. The argument is the name of the data file without the
% |.txt| extension. The first two lines of the file are shown in the log,
% then every remaining line that is not blank (a blank line is read as
% |\par|, which is what |\storedpar| holds) is passed to |\parseunicodedataI|.
\def\storedpar{\par}%
\def\readandparse#1{%
  \openin0=#1.txt %
  % If the file could not be opened the stream stays closed, which |\ifeof|
  % reports as true. A |\read| on a closed stream would try to take input from
  % the terminal, so raise a clear error and skip the file instead.
  \ifeof0 %
    \errmessage{Unicode data file #1.txt not found}%
  \else
    % Read two lines from the source file to extract the version information
    \catcode`\#=12 %
    \read0 to \unicodedataline
    \message{\unicodedataline ^^J}%
    \read0 to \unicodedataline
    \message{\unicodedataline ^^J}%
    \loop\unless\ifeof0 %
      \read0 to \unicodedataline
      \unless\ifx\unicodedataline\storedpar
        \expandafter\parseunicodedataI\unicodedataline\relax
      \fi
    \repeat
    \catcode`\#=6 %
    \closein0 %
  \fi
}%
% Set up the different line break classes recognised.
% A default is only supplied for a class when the corresponding
% |\XeTeXcharclass<class>| name is not already defined. |CJ| follows whatever
% value |ID| has, and |EX|, |IS| and |NS| follow |CL|.
% NOTE(review): the default of 256 for |CM| is described in this file's header
% as the "ignored" class; the XeTeX reference guide for current releases
% appears to document 4096 as the ignored class -- confirm which value is
% intended for the XeTeX versions being targeted.
\ifdefined\XeTeXcharclassID
\else
  \chardef\XeTeXcharclassID=1 %
\fi
\ifdefined\XeTeXcharclassCJ
\else
  \let\XeTeXcharclassCJ\XeTeXcharclassID
\fi
\ifdefined\XeTeXcharclassOP
\else
  \chardef\XeTeXcharclassOP=2 %
\fi
\ifdefined\XeTeXcharclassCL
\else
  \chardef\XeTeXcharclassCL=3 %
\fi
\ifdefined\XeTeXcharclassEX
\else
  \let\XeTeXcharclassEX\XeTeXcharclassCL
\fi
\ifdefined\XeTeXcharclassIS
\else
  \let\XeTeXcharclassIS\XeTeXcharclassCL
\fi
\ifdefined\XeTeXcharclassNS
\else
  \let\XeTeXcharclassNS\XeTeXcharclassCL
\fi
\ifdefined\XeTeXcharclassCM
\else
  \chardef\XeTeXcharclassCM=256 %
\fi
% Check the line break class and if necessary the east Asian width for the
% current code point. For code points of class |ID| or |CJ| there may be a
% range to set, and these are always recorded. In other cases if the code point
% is one of those we may need to set up then save it for checking against the
% list of east Asian widths.
%
% The macro is called by |\parseunicodedataII| with the code point data
% followed by four dots, so |#1| is the first (or only) code point in
% hexadecimal, |#2| is the end of the range or empty for a single code point,
% |#3| absorbs any left-over dots and |#4| is the line break class.
\def\ID{ID}%
\def\CJ{CJ}%
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  \def\temp{#4}%
  % The number tested is |0| followed by a |1| for each class name that
  % matches, so the test is true exactly when the class is |ID| or |CJ|.
  \ifnum 0%
    \ifx\temp\ID 1\fi
    \ifx\temp\CJ 1\fi
      >0 %
    % An empty |#2| means a single code point: treat it as a range from |#1|
    % to |#1|.
    \ifx\relax#2\relax
      \parseunicodedataIV{#1}{#1}{#4}%
    \else
      \parseunicodedataIV{#1}{#2}{#4}%
    \fi
  \else
    % Other classes are only of interest if a class number has been set up
    % for them. The class name is stored in a macro named |LB@| followed by
    % the code point in decimal, for use when the width file is read.
    \ifcsname XeTeXcharclass#4\endcsname
      \ifx\relax#2\relax
        \expandafter\def\csname LB@\number"#1\endcsname{#4}%
      \else
        % This runs inside the |\loop| of |\readandparse| and the inner |\loop|
        % overwrites |\body|. The |LB@...| definitions are local and have to
        % survive, so rather than using a group |\body| is saved and restored
        % by hand.
        \let\savedbody\body
        \count0="#1 %
        \loop
          \unless\ifnum\count0>"#2 %
            \expandafter\def\csname LB@\number\count0 \endcsname{#4}%
            \advance\count0 by 1 %
        \repeat
        \let\body\savedbody
      \fi
    \fi
  \fi
}%
% As we are inside a loop already, there needs to be a group here to preserve
% the iterator. Every code point from |#1| to |#2| (both hexadecimal,
% inclusive) is given the class number held in |\XeTeXcharclass#3|; the
% assignment is global so that it survives the group.
\def\parseunicodedataIV#1#2#3{%
  \begingroup
    \count0="#1 %
    \loop
      \unless\ifnum\count0>"#2 %
        \global\XeTeXcharclass\count0=\csname XeTeXcharclass#3\endcsname
        \advance\count0 by 1 %
    \repeat
  \endgroup
}%
\readandparse{LineBreak}%
% For |EastAsianWidth.txt|, action is only needed if the character has width
% |F|, |H| or |W|. Once again there may be a range of characters to handle.
% Second definition of |\parseunicodedataIII|, used for the width file. The
% arguments have the same layout as before: |#1| is the first (or only) code
% point in hexadecimal, |#2| the end of the range or empty, |#3| absorbs
% left-over dots and |#4| is the width value.
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  % The number tested is |0| followed by a |1| if the first token of the width
  % is |F|, |H| or |W|, so the test is true only for those widths.
  \ifnum 0%
    \if F\firsttoken#4\relax 1\fi
    \if H\firsttoken#4\relax 1\fi
    \if W\firsttoken#4\relax 1\fi
      >0 %
    \ifx\relax#2\relax
      % Single code point: pass it on as a hexadecimal constant.
      \parseunicodedataIV{"#1}%
    \else
      % Range: step through it with |\count0|. The group keeps the changes to
      % |\body| and |\count0| made here away from the |\loop| in
      % |\readandparse|; the class assignment itself is global.
      \begingroup
        \count0="#1 %
        \loop
          \unless\ifnum\count0>"#2 %
            \parseunicodedataIV{\count0}%
            \advance\count0 by 1 %
        \repeat
      \endgroup
    \fi
  \fi
}%
% Only take action if a line breaking class was previously saved: that will
% map to the correct class number. |#1| is the code point in any form TeX
% accepts as a number; the saved |LB@<decimal code point>| macro holds the
% class name, from which the |\XeTeXcharclass<class>| value is looked up.
\def\parseunicodedataIV#1{%
  \ifcsname LB@\number#1\endcsname
    \global\XeTeXcharclass#1=
      \csname XeTeXcharclass\csname LB@\number#1\endcsname\endcsname
  \fi
}%
\readandparse{EastAsianWidth}%
% Close the group opened at the start of the file: only the global
% |\XeTeXcharclass| assignments remain.
\endgroup