assembler/
glyph.rs

1//! Implement the `@...@` constructs in the source code.
2//!
//! We use `@...@` to represent the characters that the TX-2 supports
//! but which Unicode does not.  For example, `@sub_A@` represents a
//! subscripted letter A.
6//!
7//! We use the word "glyph" to denote the characters in the source
8//! code.  This includes spacing characters (Tab, Backspace, Space but
9//! not carriage return) but not shift codes (COLOR BLACK, SUPER,
10//! NORMAL, SUB, COLOR RED) or special keys which would not appear in
11//! source code (WORD EXAM, LINE FEED DOWN, LINE FEED UP, LOWER CASE,
12//! UPPER CASE, STOP).
13//!
14//! We also do not include the NULLIFY character.  This would
15//! certainly have been used in the input to the TX-2 assembler ("M4")
//! to indicate that a character on the paper tape was deleted (by
17//! punching out all the holes, setting all the bits to 1).  But this
18//! would have no role in the preparation of source code on a modern
19//! computer system (e.g. with a text editor) so we don't currently
20//! support this in the input.
21//!
22//! Due to the complexities of lexing and representing the TX-2's compound characters
//! (see for example [section 6-2.3 of the Users Handbook, "RULES FOR SYMEX FORMATION"](https://archive.org/details/tx-2-users-handbook-nov-63/page/n158/mode/1up)) we might later include additional glyphs to represent compound characters.
24//!
25//! The [`base::charset`] module deals with similar things, but this
26//! module deals with concerns that are unique to the assembler itself
27//! (that is, concerns which are not relevant to the implementation of
28//! the TX-2 emulator).
29use std::collections::HashMap;
30use std::error::Error;
31use std::fmt::{self, Debug, Display, Formatter, Write};
32use std::hash::Hash;
33use std::sync::OnceLock;
34
35use base::charset::{Script, subscript_char, superscript_char};
36
/// Describes input that the M4 assembler does not understand: either
/// a Unicode character or the name used inside a `@...@` glyph.
///
/// Characters that appear only in comments or annotations are
/// exempt.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) enum Unrecognised {
    /// The Unicode character has no counterpart on the TX-2.
    InvalidChar(char),
    /// The input contained `@foo@`, but `foo` is not the name of any
    /// glyph we know about.
    UnrecognisedGlyph(String),
}

impl Display for Unrecognised {
    /// Produces a human-readable description of the problem.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Self::UnrecognisedGlyph(name) => {
                write!(f, "'@{name}@' is not a recognised glyph name")
            }
            Self::InvalidChar(ch) => {
                write!(
                    f,
                    "'{ch}' is not part of the TX-2 assembler's character set"
                )
            }
        }
    }
}

impl Error for Unrecognised {}
67
/// Indicates the super/sub/normal script of something.
///
/// `Elevated<T>` indicates that a `T` appears in superscript,
/// subscript or normal script.  This changes the meaning (and
/// numerical value) of that item in the TX-2 assembly language.
///
/// Both fields are private; use [`elevate`] or the
/// `From<(Script, T)>` conversion to build a value, and the
/// `script()` / `get()` accessors to read it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) struct Elevated<T> {
    /// The item whose script is being recorded.
    inner: T,
    /// Whether `inner` appears in superscript, subscript or normal
    /// script.
    script: Script,
}
78
79impl<T> Elevated<T> {
80    pub(crate) fn script(&self) -> Script {
81        self.script
82    }
83
84    pub(crate) fn get(&self) -> &T {
85        &self.inner
86    }
87}
88
/// Gives a uniform way to view a value as a string slice.
///
/// This is implemented for both `&str` and `String` so that code
/// bounded on `T: AsStr` can accept either.
trait AsStr {
    /// Returns the contents of `self` as a string slice.
    fn as_str(&self) -> &str;
}

impl AsStr for &str {
    fn as_str(&self) -> &str {
        // `self` is a `&&str`; strip one level of reference.
        *self
    }
}

impl AsStr for String {
    fn as_str(&self) -> &str {
        // Name the inherent `String::as_str` explicitly so that it is
        // clear this is not a recursive call to the trait method.
        String::as_str(self)
    }
}
104
105impl<T: AsStr> Display for Elevated<T> {
106    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
107        match self.script {
108            Script::Normal => write!(f, "{}", &self.inner.as_str()),
109            Script::Super => {
110                for ch in self.inner.as_str().chars() {
111                    match superscript_char(ch) {
112                        Ok(superchar) => {
113                            f.write_char(superchar)?;
114                        }
115                        Err(_) => match glyph_of_char(ch) {
116                            Ok(elevated_glyph) => {
117                                let glyph = elevated_glyph.get();
118                                if let Some(superchar) = glyph.superscript {
119                                    f.write_char(superchar)?;
120                                } else {
121                                    write!(f, "@sup_{}@", glyph.name)?;
122                                }
123                            }
124                            Err(_) => {
125                                unimplemented!("superscript variant of {ch}")
126                            }
127                        },
128                    }
129                }
130                Ok(())
131            }
132            Script::Sub => {
133                for ch in self.inner.as_str().chars() {
134                    match subscript_char(ch) {
135                        Ok(subchar) => {
136                            f.write_char(subchar)?;
137                        }
138                        Err(_) => match glyph_of_char(ch) {
139                            Ok(elevated_glyph) => {
140                                let glyph = elevated_glyph.get();
141                                if let Some(superchar) = glyph.superscript {
142                                    f.write_char(superchar)?;
143                                } else {
144                                    write!(f, "@sub_{}@", glyph.name)?;
145                                }
146                            }
147                            Err(_) => {
148                                unimplemented!("find subscript variant of {ch}")
149                            }
150                        },
151                    }
152                }
153                Ok(())
154            }
155        }
156    }
157}
158
159impl<T> From<(Script, T)> for Elevated<T> {
160    fn from((script, inner): (Script, T)) -> Elevated<T> {
161        Elevated { inner, script }
162    }
163}
164
165/// Create an instance of [`Elevated<T>`].
166pub(crate) fn elevate<T>(script: Script, inner: T) -> Elevated<T> {
167    Elevated { inner, script }
168}
169
/// A character which might appear in source code.
///
/// We include mappings to Unicode representation where this exists.
/// However, there are also cases where more than one Unicode
/// character (in the assembler input) might get mapped to the same
/// Glyph; see [`canonicalise_char`].
///
/// Each of the three Unicode mappings (`normal`, `superscript`,
/// `subscript`) is `None` when we do not provide a Unicode character
/// for the glyph in that script.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct Glyph {
    /// Indicates the shape of the glyph without regard to its
    /// (superscript/subscript/normal) position with respect to the
    /// character baseline.
    pub(crate) shape: GlyphShape,
    /// The name of the glyph as we would use it inside `@...@`
    /// (for example `hand` in `@hand@`).
    pub(crate) name: &'static str,
    /// The Unicode representation of this glyph when in normal
    /// script.
    pub(crate) normal: Option<char>,
    /// The Unicode representation of this glyph when in superscript.
    pub(crate) superscript: Option<char>,
    /// The Unicode representation of this glyph when in subscript.
    pub(crate) subscript: Option<char>,
    /// When advance is false, this glyph does not advance the Lincoln
    /// Writer's print carriage.  This appears to be true for
    /// character codes 0o12 (underbar, overbar) and 0o13 (circle,
    /// square).  We should provide a reference for this, but just now
    /// I'm taking this info from the code in base/src/charset.rs
    /// which deals with these character codes.
    ///
    /// We try to use combining characters for these.
    pub(crate) advance: bool,
}
201
202impl Glyph {
203    pub(crate) fn shape(&self) -> GlyphShape {
204        self.shape
205    }
206
207    pub(crate) fn get_char(&self, script: Script) -> Option<char> {
208        match script {
209            Script::Normal => self.normal,
210            Script::Super => self.superscript,
211            Script::Sub => self.subscript,
212        }
213    }
214}
215
#[test]
fn test_subscript_char_agreement() {
    // Wherever both the glyph table and `subscript_char` offer a
    // subscript form for the same normal-script character, the two
    // must agree.  Glyphs lacking either mapping are skipped.
    for g in ALL_GLYPHS {
        let (Some(ch), Some(glyph_sub_ch)) = (g.normal, g.subscript) else {
            continue;
        };
        let Ok(charset_sub_ch) = subscript_char(ch) else {
            continue;
        };
        assert_eq!(
            glyph_sub_ch,
            charset_sub_ch,
            "glyph {g:?} maps {ch} to {glyph_sub_ch} ({}) but subscript_char maps it to {charset_sub_ch} ({})",
            glyph_sub_ch.escape_unicode(),
            charset_sub_ch.escape_unicode(),
        );
    }
}
233
#[test]
fn test_superscript_char_agreement() {
    // Wherever both the glyph table and `superscript_char` offer a
    // superscript form for the same normal-script character, the two
    // must agree.  Glyphs lacking either mapping are skipped.
    for g in ALL_GLYPHS {
        let (Some(ch), Some(glyph_sup_ch)) = (g.normal, g.superscript) else {
            continue;
        };
        let Ok(charset_sup_ch) = superscript_char(ch) else {
            continue;
        };
        assert_eq!(
            glyph_sup_ch,
            charset_sup_ch,
            "glyph {g:?} maps {ch} to {glyph_sup_ch} ({}) but superscript_char maps it to {charset_sup_ch} ({})",
            glyph_sup_ch.escape_unicode(),
            charset_sup_ch.escape_unicode(),
        );
    }
}
251
#[test]
fn test_glyph_names_do_not_contain_underscore() {
    // The `sup_` and `sub_` prefixes already use an underscore as a
    // kind of separator, so allowing one inside a glyph name would
    // probably be too confusing.  This test stops anybody adding one.
    //
    // A further convention is under consideration, in which a
    // combining character is written by including the name of the
    // second character inside @...@: for example @square_minus@ for
    // a square (which does not advance the carriage) followed by a
    // minus sign (which does).  That plan would also need '_' to be
    // treated as a glyph name separator.
    for name in ALL_GLYPHS.iter().map(|glyph| glyph.name) {
        assert!(
            !name.contains('_'),
            "glyph name {} should not contain an underscore",
            name
        );
    }
}
272
// TODO: probably doesn't need to be a module.
mod shape {
    //! Used to limit effect of `allow(non_camel_case_types)`;
    //! probably not needed.

    /// Lincoln Writer character shapes.
    ///
    /// All character shapes in the character set table from page 2 of
    /// the documentation on the Lincoln Writer channels (65, 66).
    /// TX-2 Users Handbook, July 1961.
    ///
    /// The derived ordering (`PartialOrd`/`Ord`) follows the order in
    /// which the variants are declared below, so reordering variants
    /// changes how shapes compare.
    // Some variants (`n`, `q`, `t`, ...) are deliberately lower-case
    // so that they remain distinct from the upper-case letter
    // variants (`N`, `Q`, `T`, ...); that would otherwise trigger the
    // `non_camel_case_types` lint.
    #[allow(non_camel_case_types)]
    #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
    pub(crate) enum GlyphShape {
        Digit0,
        Digit1,
        Digit2,
        Digit3,
        Digit4,
        Digit5,
        Digit6,
        Digit7,
        Digit8,
        Digit9,
        Underscore,
        Circle,
        A,
        B,
        C,
        D,
        E,
        F,
        G,
        H,
        I,
        J,
        K,
        L,
        M,
        N,
        O,
        P,
        Q,
        R,
        S,
        T,
        U,
        V,
        W,
        X,
        Y,
        Z,
        LeftParen,
        RightParen,
        Add,
        Minus,
        Comma,
        Dot,
        // No CARRIAGE RETURN
        Tab,
        Backspace,
        // No COLOR BLACK, SUPER, NORMAL, SUB, COLOR RED
        Space,
        // No WORD EXAM, LINE FEED DOWN, LINE FEED UP, LOWER CASE, UPPER
        // CASE, STOP, NULLIFY.
        Hand,
        Sigma,
        Pipe,
        DoublePipe,
        Solidus,
        Times,
        Hash,
        Arrow,
        LessThan,
        GreaterThan,
        Overbar,
        Square,
        n,
        SubsetOf,
        Or,
        q,
        Gamma,
        t,
        w,
        x,
        i,
        y,
        z,
        Query,
        Union,
        Intersection,
        j,
        k,
        Alpha,
        Delta,
        p,
        Epsilon,
        h,
        SupersetOf,
        Beta,
        And,
        Lambda,
        Tilde,
        LeftBrace,
        RightBrace,
        IdenticalTo, /* hamb */
        Equals,
        Apostrophe,
        Asterisk,
    }
}
383pub(crate) use shape::GlyphShape;
384
/// Error reporting that a Unicode character has no equivalent in the
/// TX-2 character set.  The offending character is carried in the
/// single public field.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NotInCharacterSet(pub char);

impl Display for NotInCharacterSet {
    /// Produces a human-readable message naming the character.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let Self(ch) = self;
        f.write_fmt(format_args!(
            "Character '{}' is not in the TX-2's Lincoln Writer character set",
            ch
        ))
    }
}

impl Error for NotInCharacterSet {}
400
401/// Convert a superscript/subscript/normal Unicode character, if we
402/// recognise it, into [`Elevated<&'static Glyph>`].
403pub(crate) fn glyph_of_char(original: char) -> Result<Elevated<&'static Glyph>, Unrecognised> {
404    let ch: char = canonicalise_char(original);
405    let mapping = glyph_map();
406    match mapping.get(ch) {
407        Some(elevated) => Ok(elevated),
408        None => Err(Unrecognised::InvalidChar(original)),
409    }
410}
411
#[test]
fn test_space_is_normal() {
    // A plain space must be recognised, and must be reported as
    // being in normal script.
    let elevated = glyph_of_char(' ')
        .unwrap_or_else(|e| panic!("unexpected failure to look up space: {e:?}"));
    assert_eq!(elevated.script(), Script::Normal);
}
423
424impl TryFrom<char> for Elevated<&'static Glyph> {
425    type Error = NotInCharacterSet;
426
427    fn try_from(ch: char) -> Result<Self, Self::Error> {
428        glyph_of_char(ch).map_err(|_| NotInCharacterSet(ch))
429    }
430}
431
#[test]
fn test_glyph_of_dot() {
    // Looking up '.' and looking up '·' must give identical results.
    let from_full_stop = glyph_of_char('.');
    let from_raised_dot = glyph_of_char('·');
    assert_eq!(from_full_stop, from_raised_dot);
}
436
437//const fn code_point_of_shape(g: GlyphShape) -> (LwCase, Unsigned6Bit) {
438//     use base::charset::LwCase;
//     use base::{u6, Unsigned6Bit};
440//    // Information taken from the character set table from page 2 of
441//    // the documentation on the Lincoln Writer channels (65, 66).
442//    // TX-2 Users Handbook, July 1961.
443//    const L: LwCase = LwCase::Lower;
444//    const U: LwCase = LwCase::Upper;
445//    match g {
446//        GlyphShape::Digit0 => (L, u6!(0)),
447//        GlyphShape::Digit1 => (L, u6!(1)),
448//        GlyphShape::Digit2 => (L, u6!(2)),
449//        GlyphShape::Digit3 => (L, u6!(3)),
450//        GlyphShape::Digit4 => (L, u6!(4)),
451//        GlyphShape::Digit5 => (L, u6!(5)),
452//        GlyphShape::Digit6 => (L, u6!(6)),
453//        GlyphShape::Digit7 => (L, u6!(7)),
454//        GlyphShape::Digit8 => (L, u6!(0o10)),
455//        GlyphShape::Digit9 => (L, u6!(0o11)),
456//        GlyphShape::Underscore => (L, u6!(0o12)),
457//        GlyphShape::Circle => (L, u6!(0o13)),
458//        GlyphShape::A => (L, u6!(0o20)),
459//        GlyphShape::B => (L, u6!(0o21)),
460//        GlyphShape::C => (L, u6!(0o22)),
461//        GlyphShape::D => (L, u6!(0o23)),
462//        GlyphShape::E => (L, u6!(0o24)),
463//        GlyphShape::F => (L, u6!(0o25)),
464//        GlyphShape::G => (L, u6!(0o26)),
465//        GlyphShape::H => (L, u6!(0o27)),
466//        GlyphShape::I => (L, u6!(0o30)),
467//        GlyphShape::J => (L, u6!(0o31)),
468//        GlyphShape::K => (L, u6!(0o32)),
469//        GlyphShape::L => (L, u6!(0o33)),
470//        GlyphShape::M => (L, u6!(0o34)),
471//        GlyphShape::N => (L, u6!(0o35)),
472//        GlyphShape::O => (L, u6!(0o36)),
473//        GlyphShape::P => (L, u6!(0o37)),
474//        GlyphShape::Q => (L, u6!(0o40)),
475//        GlyphShape::R => (L, u6!(0o41)),
476//        GlyphShape::S => (L, u6!(0o42)),
477//        GlyphShape::T => (L, u6!(0o43)),
478//        GlyphShape::U => (L, u6!(0o44)),
479//        GlyphShape::V => (L, u6!(0o45)),
480//        GlyphShape::W => (L, u6!(0o46)),
481//        GlyphShape::X => (L, u6!(0o47)),
482//        GlyphShape::Y => (L, u6!(0o50)),
483//        GlyphShape::Z => (L, u6!(0o51)),
484//        GlyphShape::LeftParen => (L, u6!(0o52)),
485//        GlyphShape::RightParen => (L, u6!(0o53)),
486//        GlyphShape::Add => (L, u6!(0o54)),
487//        GlyphShape::Minus => (L, u6!(0o55)),
488//        GlyphShape::Comma => (L, u6!(0o56)),
489//        GlyphShape::Dot => (L, u6!(0o57)),
490//        GlyphShape::Tab => (L, u6!(0o61)),
491//        GlyphShape::Backspace => (L, u6!(0o62)),
492//        // 0o63 is COLOR BLACK
493//        //
494//        // 0o64 is SUPER
495//        //
496//        // 0o65 is NORMAL
497//        //
498//        // 0o66 is SUB
499//        //
500//        // 0o67 is COLOR RED
501//        GlyphShape::Space => (L, u6!(0o70)),
502//        // 0o71 is WORD EXAM
503//        //
504//        // 0o72 is LINE FEED DOWN
505//        //
506//        // 0o73 is LINE FEED UP
507//        //
508//        // 0o74 is LOWER CASE
509//        //
510//        // 0o75 is UPPER CASE
511//        //
512//        // 0o76 is STOP
513//        //
514//        // 0o77 is NULLIFY
515//        GlyphShape::Hand => (U, u6!(0)),
516//        GlyphShape::Sigma => (U, u6!(1)),
517//        GlyphShape::Pipe => (U, u6!(2)),
518//        GlyphShape::DoublePipe => (U, u6!(3)),
519//        GlyphShape::Solidus => (U, u6!(4)),
520//        GlyphShape::Times => (U, u6!(5)),
521//        GlyphShape::Hash => (U, u6!(6)),
522//        GlyphShape::Arrow => (U, u6!(7)),
523//        GlyphShape::LessThan => (U, u6!(0o10)),
524//        GlyphShape::GreaterThan => (U, u6!(0o11)),
525//        GlyphShape::Overbar => (U, u6!(0o12)),
526//        GlyphShape::Square => (U, u6!(0o13)),
527//        // 0o14 is "READ IN"
528//        //
529//        // 0o15 is "BEGIN"
530//        //
531//        // 0o16 is "NO"
532//        //
533//        // 0o17 is "YES"
534//        GlyphShape::n => (U, u6!(0o20)),
535//        GlyphShape::SubsetOf => (U, u6!(0o21)),
536//        GlyphShape::Or => (U, u6!(0o22)),
537//        GlyphShape::q => (U, u6!(0o23)),
538//        GlyphShape::Gamma => (U, u6!(0o24)),
539//        GlyphShape::t => (U, u6!(0o25)),
540//        GlyphShape::w => (U, u6!(0o26)),
541//        GlyphShape::x => (U, u6!(0o27)),
542//        GlyphShape::i => (U, u6!(0o30)),
543//        GlyphShape::y => (U, u6!(0o31)),
544//        GlyphShape::z => (U, u6!(0o32)),
545//        GlyphShape::Query => (U, u6!(0o33)),
546//        GlyphShape::Union => (U, u6!(0o34)),
547//        GlyphShape::Intersection => (U, u6!(0o35)),
548//        GlyphShape::j => (U, u6!(0o36)),
549//        GlyphShape::k => (U, u6!(0o37)),
550//        GlyphShape::Alpha => (U, u6!(0o40)),
551//        GlyphShape::Delta => (U, u6!(0o41)),
552//        GlyphShape::p => (U, u6!(0o42)),
553//        GlyphShape::Epsilon => (U, u6!(0o43)),
554//        GlyphShape::h => (U, u6!(0o44)),
555//        GlyphShape::SupersetOf => (U, u6!(0o45)),
556//        GlyphShape::Beta => (U, u6!(0o46)),
557//        GlyphShape::And => (U, u6!(0o47)),
558//        GlyphShape::Lambda => (U, u6!(0o50)),
559//        GlyphShape::Tilde => (U, u6!(0o51)),
560//        GlyphShape::LeftBrace => (U, u6!(0o52)),
561//        GlyphShape::RightBrace => (U, u6!(0o53)),
562//        GlyphShape::IdenticalTo => (U, u6!(0o54)), // @hamb@
563//        GlyphShape::Equals => (U, u6!(0o55)),
564//        GlyphShape::Apostrophe => (U, u6!(0o56)),
565//        GlyphShape::Asterisk => (U, u6!(0o57)),
//        // Code points 0o60 to 0o77 are non-graphic characters.
567//    }
568//}
569
/// Used to save typing to provide defaults in the definitions in
/// [`ALL_GLYPHS`].
///
/// Entries written with struct-update syntax (`..GDEF`) inherit
/// whichever fields they do not set explicitly.  The `shape` and
/// `name` given here are placeholders only; entries are expected to
/// override both.
const GDEF: Glyph = Glyph {
    // Placeholder; not meaningful as a default.
    shape: GlyphShape::Hand,
    // Placeholder; not a usable glyph name.
    name: "",
    // By default a glyph has no Unicode mapping in any script.
    normal: None,
    superscript: None,
    subscript: None,
    // By default a glyph advances the carriage; glyphs which do not
    // must set this to false themselves.
    advance: true,
};
580
581/// Symbols understood by the M4 assembler (other than compound
582/// symbols).
583///
584/// Information taken from the character set table from page 2 of
585/// the documentation on the Lincoln Writer channels (65, 66).
586/// TX-2 Users Handbook, July 1961.
587const ALL_GLYPHS: &[Glyph] = &[
588    Glyph {
589        shape: GlyphShape::Digit0,
590        name: "0",
591        normal: Some('0'),
592        superscript: Some('⁰'),
593        subscript: Some('₀'),
594        ..GDEF
595    },
596    Glyph {
597        shape: GlyphShape::Digit1,
598        name: "1",
599        normal: Some('1'),
600        subscript: Some('₁'),
601        superscript: Some('¹'),
602        ..GDEF
603    },
604    Glyph {
605        shape: GlyphShape::Digit2,
606        name: "2",
607        normal: Some('2'),
608        subscript: Some('₂'),
609        superscript: Some('²'),
610        ..GDEF
611    },
612    Glyph {
613        shape: GlyphShape::Digit3,
614        name: "3",
615        normal: Some('3'),
616        subscript: Some('₃'),
617        superscript: Some('³'),
618        ..GDEF
619    },
620    Glyph {
621        shape: GlyphShape::Digit4,
622        name: "4",
623        normal: Some('4'),
624        subscript: Some('₄'),
625        superscript: Some('⁴'),
626        ..GDEF
627    },
628    Glyph {
629        shape: GlyphShape::Digit5,
630        name: "5",
631        normal: Some('5'),
632        subscript: Some('₅'),
633        superscript: Some('⁵'),
634        ..GDEF
635    },
636    Glyph {
637        shape: GlyphShape::Digit6,
638        name: "6",
639        normal: Some('6'),
640        subscript: Some('₆'),
641        superscript: Some('⁶'),
642        ..GDEF
643    },
644    Glyph {
645        shape: GlyphShape::Digit7,
646        name: "7",
647        normal: Some('7'),
648        subscript: Some('₇'),
649        superscript: Some('⁷'),
650        ..GDEF
651    },
652    Glyph {
653        shape: GlyphShape::Digit8,
654        name: "8",
655        normal: Some('8'),
656        subscript: Some('₈'),
657        superscript: Some('⁸'),
658        ..GDEF
659    },
660    Glyph {
661        shape: GlyphShape::Digit9,
662        name: "9",
663        normal: Some('9'),
664        subscript: Some('₉'),
665        superscript: Some('⁹'),
666        ..GDEF
667    },
668    Glyph {
669        shape: GlyphShape::Underscore,
670        name: "underscore",
671        // This character does not advance the carriage, so instead of
672        // representing it with ASCII \x5F (underscore) we use a
673        // combining low line.
674        normal: Some('\u{0332}'), // U+0332, combining low line
675        advance: false,
676        ..GDEF
677    },
678    Glyph {
679        shape: GlyphShape::Circle,
680        name: "circle",
681        // U+25CB, Unicode white circle, '○', advances the cursor
682        // position, which the Lincoln Writer code (0o13) doesn't do.
683        // So we use a combining character.
684        normal: Some('\u{20DD}'), // U+20DD, combining enclosing circle
685        advance: false,
686        ..GDEF
687    },
688    // 0o14 is "READ IN"
689    //
690    // 0o15 is "BEGIN"
691    //
692    // 0o16 is "NO"
693    //
694    // 0o17 is "YES"
695    Glyph {
696        shape: GlyphShape::A,
697        name: "A",
698        normal: Some('A'),
699        superscript: Some('ᴬ'),
700        ..GDEF
701    },
702    Glyph {
703        shape: GlyphShape::B,
704        name: "B",
705        normal: Some('B'),
706        superscript: Some('ᴮ'),
707        ..GDEF
708    },
709    Glyph {
710        shape: GlyphShape::C,
711        name: "C",
712        normal: Some('C'),
713        superscript: Some('ꟲ'), // U+A7F2 (we don't use U+1D9C, that's the lower-case C)
714        ..GDEF
715    },
716    Glyph {
717        shape: GlyphShape::D,
718        name: "D",
719        normal: Some('D'),
720        superscript: Some('ᴰ'),
721        ..GDEF
722    },
723    Glyph {
724        shape: GlyphShape::E,
725        name: "E",
726        normal: Some('E'),
727        superscript: Some('ᴱ'),
728        ..GDEF
729    },
730    Glyph {
731        shape: GlyphShape::F,
732        name: "F",
733        normal: Some('F'),
734        superscript: Some('ꟳ'),
735        ..GDEF
736    },
737    Glyph {
738        shape: GlyphShape::G,
739        name: "G",
740        normal: Some('G'),
741        superscript: Some('ᴳ'),
742        ..GDEF
743    },
744    Glyph {
745        shape: GlyphShape::H,
746        name: "H",
747        normal: Some('H'),
748        superscript: Some('ᴴ'),
749        ..GDEF
750    },
751    Glyph {
752        shape: GlyphShape::I,
753        name: "I",
754        normal: Some('I'),
755        superscript: Some('ᴵ'),
756        ..GDEF
757    },
758    Glyph {
759        shape: GlyphShape::J,
760        name: "J",
761        normal: Some('J'),
762        superscript: Some('ᴶ'),
763        ..GDEF
764    },
765    Glyph {
766        shape: GlyphShape::K,
767        name: "K",
768        normal: Some('K'),
769        superscript: Some('ᴷ'),
770        ..GDEF
771    },
772    Glyph {
773        shape: GlyphShape::L,
774        name: "L",
775        normal: Some('L'),
776        superscript: Some('ᴸ'),
777        ..GDEF
778    },
779    Glyph {
780        shape: GlyphShape::M,
781        name: "M",
782        normal: Some('M'),
783        superscript: Some('ᴹ'),
784        ..GDEF
785    },
786    Glyph {
787        shape: GlyphShape::N,
788        name: "N",
789        normal: Some('N'),
790        superscript: Some('ᴺ'),
791        ..GDEF
792    },
793    Glyph {
794        shape: GlyphShape::O,
795        name: "O",
796        normal: Some('O'),
797        superscript: Some('ᴼ'),
798        ..GDEF
799    },
800    Glyph {
801        shape: GlyphShape::P,
802        name: "P",
803        normal: Some('P'),
804        superscript: Some('ᴾ'),
805        ..GDEF
806    },
807    Glyph {
808        shape: GlyphShape::Q,
809        name: "Q",
810        normal: Some('Q'),
811        superscript: Some('ꟴ'),
812        ..GDEF
813    },
814    Glyph {
815        shape: GlyphShape::R,
816        name: "R",
817        normal: Some('R'),
818        superscript: Some('ᴿ'),
819        ..GDEF
820    },
821    Glyph {
822        shape: GlyphShape::S,
823        name: "S",
824        normal: Some('S'),
825        // There is no Unicode superscript 'S', U+2E2 is a superscript 's'.
826        superscript: None,
827        ..GDEF
828    },
829    Glyph {
830        shape: GlyphShape::T,
831        name: "T",
832        normal: Some('T'),
833        superscript: Some('ᵀ'),
834        ..GDEF
835    },
836    Glyph {
837        shape: GlyphShape::U,
838        name: "U",
839        normal: Some('U'),
840        superscript: Some('ᵁ'),
841        ..GDEF
842    },
843    Glyph {
844        shape: GlyphShape::V,
845        name: "V",
846        normal: Some('V'),
847        superscript: Some('ⱽ'),
848        ..GDEF
849    },
850    Glyph {
851        shape: GlyphShape::W,
852        name: "W",
853        normal: Some('W'),
854        superscript: Some('ᵂ'),
855        ..GDEF
856    },
857    Glyph {
858        shape: GlyphShape::X,
859        name: "X",
860        normal: Some('X'),
861        // There is no superscript X in Unicode.
862        ..GDEF
863    },
864    Glyph {
865        shape: GlyphShape::Y,
866        name: "Y",
867        normal: Some('Y'),
868        // There is no superscript Y in Unicode.
869        ..GDEF
870    },
871    Glyph {
872        shape: GlyphShape::Z,
873        name: "Z",
874        normal: Some('Z'),
875        // There is no superscript Z in Unicode.
876        ..GDEF
877    },
878    Glyph {
879        shape: GlyphShape::LeftParen,
880        name: "lparen",
881        normal: Some('('),
882        subscript: Some('₍'),
883        ..GDEF
884    },
885    Glyph {
886        shape: GlyphShape::RightParen,
887        name: "rparen",
888        normal: Some(')'),
889        subscript: Some('₎'),
890        ..GDEF
891    },
892    Glyph {
893        shape: GlyphShape::Add,
894        name: "add", // following sub.py
895        normal: Some('+'),
896        superscript: Some('⁺'),
897        subscript: Some('₊'),
898        ..GDEF
899    },
900    Glyph {
901        shape: GlyphShape::Minus,
902        name: "minus", // following sub.py
903        normal: Some('-'),
904        superscript: Some('⁻'),
905        subscript: Some('₋'),
906        ..GDEF
907    },
908    Glyph {
909        shape: GlyphShape::Comma,
910        name: "comma",
911        normal: Some(','),
912        ..GDEF
913    },
914    Glyph {
915        shape: GlyphShape::Dot,
916        name: "dot",
917        // This is a centre dot, not a period.  We use a centre dot so
918        // that it's not confused with a subscript dot.
919        normal: Some('\u{00B7}'), // ·
920
921        // Using an ASCII full stop / period (".") would be too
922        // confusing for the user, who (when preparing source code
923        // input) might expect this to be interpreted as the
924        // normal-script PERIOD.  So for subscript we instead use
925        // U+2024, "One Dot Leader".
926        subscript: Some('\u{2024}'), // "․" (not ASCII ".")
927        superscript: None,
928        ..GDEF
929    },
930    // CARRIAGE RETURN is missing.
931    Glyph {
932        shape: GlyphShape::Tab,
933        name: "tab",
934        normal: Some('\t'),
935        ..GDEF
936    },
937    Glyph {
938        // backspace is used in some combining-character symexes.
939        shape: GlyphShape::Backspace,
940        name: "backspace",
941        normal: None, // better to say @backspace@.
942        ..GDEF
943    },
944    // 0o63 is COLOR BLACK
945    //
946    // 0o64 is SUPER
947    //
948    // 0o65 is NORMAL
949    //
950    // 0o66 is SUB
951    //
952    // 0o67 is COLOR RED
953    Glyph {
954        shape: GlyphShape::Space,
955        name: "space",
956        normal: Some(' '),
957        subscript: Some(' '),
958        superscript: Some(' '),
959        ..GDEF
960    },
961    // 0o71 is WORD EXAM
962    //
963    // 0o72 is LINE FEED DOWN
964    //
965    // 0o73 is LINE FEED UP
966    //
967    // 0o74 is LOWER CASE
968    //
969    // 0o75 is UPPER CASE
970    //
971    // 0o76 is STOP
972    //
973    // 0o77 is NULLIFY
974    //
975    //
976    // Right-hand column of the character set table follows.
977    Glyph {
978        shape: GlyphShape::Hand,
979        name: "hand",
980        normal: Some('☛'), // U+261B
981        ..GDEF
982    },
983    Glyph {
984        shape: GlyphShape::Sigma,
985        name: "sigma",
986        normal: Some('Σ'), // U+03A3
987        ..GDEF
988    },
989    Glyph {
990        shape: GlyphShape::Pipe,
991        name: "pipe",
992        normal: Some('|'),
993        ..GDEF
994    },
995    Glyph {
996        shape: GlyphShape::DoublePipe,
997        name: "doublepipe",
998        normal: Some('‖'),
999        ..GDEF
1000    },
1001    Glyph {
1002        shape: GlyphShape::Solidus,
1003        name: "solidus", // better known as "slash".
1004        normal: Some('/'),
1005        ..GDEF
1006    },
1007    Glyph {
1008        shape: GlyphShape::Times,
1009        name: "times",
1010        normal: Some('×'),
1011        ..GDEF
1012    },
1013    Glyph {
1014        shape: GlyphShape::Hash,
1015        name: "hash",
1016        normal: Some('#'),
1017        ..GDEF
1018    },
1019    Glyph {
1020        shape: GlyphShape::Arrow,
1021        // arr not arrow to follow Jurij's sub.py
1022        name: "arr",
1023        normal: Some('\u{2192}'), // →
1024        ..GDEF
1025    },
1026    Glyph {
1027        shape: GlyphShape::LessThan,
1028        name: "lessthan",
1029        normal: Some('<'),
1030        ..GDEF
1031    },
1032    Glyph {
1033        shape: GlyphShape::GreaterThan,
1034        name: "greaterthan",
1035        normal: Some('>'),
1036        ..GDEF
1037    },
1038    Glyph {
1039        shape: GlyphShape::Overbar,
1040        name: "overbar",
1041        // This character does not advance the carriage, so we use a
1042        // combining character for it.
1043        normal: Some('\u{0305}'), // U+0305, combining overline
1044        superscript: None,
1045        subscript: None,
1046        advance: false,
1047    },
1048    Glyph {
1049        shape: GlyphShape::Square,
1050        name: "square",
1051        // This character does not advance the carriage, so instead of
1052        // using a character like U+25A1 ('□'), we use a combining
1053        // character.
1054        normal: Some('\u{20DE}'), // U+20DE, combining enclosing square
1055        subscript: None,
1056        superscript: None,
1057        advance: false,
1058    },
1059    // 0o14 is "READ IN"
1060    //
1061    // 0o15 is "BEGIN"
1062    //
1063    // 0o16 is "NO"
1064    //
1065    // 0o17 is "YES"
1066    Glyph {
1067        shape: GlyphShape::n,
1068        name: "n",
1069        normal: Some('n'),
1070        superscript: Some('ⁿ'),
1071        subscript: Some('ₙ'), // U+2099
1072        ..GDEF
1073    },
1074    Glyph {
1075        shape: GlyphShape::SubsetOf,
1076        name: "subsetof",
1077        normal: Some('\u{2282}'), // Subset of, ⊂
1078        ..GDEF
1079    },
1080    Glyph {
1081        shape: GlyphShape::Or,
1082        name: "or",
1083        normal: Some('∨'),
1084        ..GDEF
1085    },
1086    Glyph {
1087        shape: GlyphShape::q,
1088        name: "q",
1089        normal: Some('q'),
1090        superscript: None,
1091        // U+107A5 is a subscript q, but this is not widely supported,
1092        // so we don't use it.  Instead the user should use "@sub_q@".
1093        subscript: None,
1094        ..GDEF
1095    },
1096    Glyph {
1097        shape: GlyphShape::Gamma,
1098        name: "gamma",
1099        normal: Some('γ'), // U+03B3, Greek small letter gamma
1100        superscript: Some('ᵞ'),
1101        subscript: Some('ᵧ'),
1102        ..GDEF
1103    },
1104    Glyph {
1105        shape: GlyphShape::t,
1106        name: "t",
1107        normal: Some('t'),
1108        superscript: Some('ᵗ'), // U+1D57
1109        subscript: Some('ₜ'),   // U+209C
1110        ..GDEF
1111    },
1112    Glyph {
1113        shape: GlyphShape::w,
1114        name: "w",
1115        normal: Some('w'),
1116        superscript: Some('ʷ'),
1117        subscript: None,
1118        ..GDEF
1119    },
1120    Glyph {
1121        shape: GlyphShape::x,
1122        name: "x",
1123        normal: Some('x'),
1124        superscript: Some('ˣ'),
1125        subscript: Some('ₓ'), // U+2093
1126        ..GDEF
1127    },
1128    Glyph {
1129        shape: GlyphShape::i,
1130        name: "i",
1131        normal: Some('i'),
1132        superscript: Some('ⁱ'),
1133        subscript: Some('ᵢ'),
1134        ..GDEF
1135    },
1136    Glyph {
1137        shape: GlyphShape::y,
1138        name: "y",
1139        normal: Some('y'),
1140        superscript: Some('ʸ'),
1141        subscript: None,
1142        ..GDEF
1143    },
1144    Glyph {
1145        shape: GlyphShape::z,
1146        name: "z",
1147        normal: Some('z'),
1148        subscript: None,
1149        superscript: Some('ᶻ'),
1150        ..GDEF
1151    },
1152    Glyph {
1153        shape: GlyphShape::Query, // A question mark.
1154        name: "?",
1155        normal: Some('?'),
1156        superscript: Some('ˀ'), // dot is missing but it's the best we can do.
1157        // U+FE56, "Small Question Mark" is not really a subscript
1158        // character, but let's try it out.
1159        subscript: Some('﹖'),
1160        ..GDEF
1161    },
1162    Glyph {
1163        shape: GlyphShape::Union,
1164        name: "union",
1165        normal: Some('∪'),
1166        superscript: None,
1167        subscript: None,
1168        ..GDEF
1169    },
1170    Glyph {
1171        shape: GlyphShape::Intersection,
1172        name: "intersection",
1173        normal: Some('\u{2229}'),
1174        subscript: None,
1175        superscript: None,
1176        ..GDEF
1177    },
1178    Glyph {
1179        shape: GlyphShape::j,
1180        name: "j",
1181        normal: Some('j'),
1182        superscript: Some('ʲ'), // U+02B2
1183        subscript: Some('ⱼ'),   // U+2C7C
1184        ..GDEF
1185    },
1186    Glyph {
1187        shape: GlyphShape::k,
1188        name: "k",
1189        normal: Some('k'),
1190        superscript: Some('ᵏ'),
1191        subscript: Some('ₖ'), // U+2096
1192        ..GDEF
1193    },
1194    Glyph {
1195        shape: GlyphShape::Alpha,
1196        name: "alpha",
1197        normal: Some('α'), // U+03B1, alpha
1198        // this is actually a Latin superscript alpha, but it will normally look the same.
1199        superscript: Some('ᵅ'),
1200        subscript: None,
1201        ..GDEF
1202    },
1203    Glyph {
1204        shape: GlyphShape::Delta,
1205        name: "delta",
        normal: Some('Δ'), // U+0394, capital delta
1207        ..GDEF
1208    },
1209    Glyph {
1210        shape: GlyphShape::p,
1211        name: "p",
1212        normal: Some('p'),
1213        superscript: Some('ᵖ'),
1214        subscript: Some('ₚ'), // U+209A
1215        ..GDEF
1216    },
1217    Glyph {
1218        shape: GlyphShape::Epsilon,
1219        name: "eps",
1220        normal: Some('ε'),      // U+03B5, Epsilon (not ∈, Element of)
1221        superscript: Some('ᵋ'), // U+1D4B
1222        subscript: None,
1223        ..GDEF
1224    },
1225    Glyph {
1226        shape: GlyphShape::h,
1227        name: "h",
1228        normal: Some('h'),
1229        superscript: Some('ʰ'),
1230        subscript: Some('ₕ'),
1231        ..GDEF
1232    },
1233    Glyph {
1234        shape: GlyphShape::SupersetOf,
1235        name: "sup",       // name aligns with Jurij's sub.py
1236        normal: Some('⊃'), // U+2283, superset of
1237        superscript: None,
1238        subscript: None,
1239        ..GDEF
1240    },
1241    Glyph {
1242        shape: GlyphShape::Beta,
1243        name: "beta",
1244        normal: Some('β'),      // U+03B2, Greek beta symbol
1245        superscript: Some('ᵝ'), // U+1D5D
1246        subscript: Some('ᵦ'),   // U+1D66
1247        ..GDEF
1248    },
1249    Glyph {
1250        shape: GlyphShape::And,
1251        name: "and",
1252        normal: Some('∧'), // U+2227, Logical And
1253        superscript: None,
1254        subscript: None,
1255        ..GDEF
1256    },
1257    Glyph {
1258        shape: GlyphShape::Lambda,
1259        name: "lambda",
1260        normal: Some('λ'), // U+3BB, Greek letter lambda
1261        superscript: None,
1262        subscript: None,
1263        ..GDEF
1264    },
1265    Glyph {
1266        shape: GlyphShape::Tilde,
1267        name: "tilde",
1268        normal: Some('~'),
1269        ..GDEF
1270    },
1271    Glyph {
1272        shape: GlyphShape::LeftBrace,
1273        name: "leftbrace",
1274        normal: Some('{'),
1275        ..GDEF
1276    },
1277    Glyph {
1278        shape: GlyphShape::RightBrace,
1279        name: "rightbrace",
1280        normal: Some('}'),
1281        ..GDEF
1282    },
1283    Glyph {
1284        shape: GlyphShape::IdenticalTo,
1285        name: "hamb",      // following Jurij's sub.py
1286        normal: Some('≡'), // U+2261, Identical to (Jurij used ☰, U+2630, Trigram For Heaven)
1287        ..GDEF
1288    },
1289    Glyph {
1290        shape: GlyphShape::Equals,
1291        name: "equals",
1292        normal: Some('='),
1293        subscript: Some('₌'),
1294        ..GDEF
1295    },
1296    Glyph {
1297        shape: GlyphShape::Apostrophe,
1298        name: "apostrophe",
1299        normal: Some('\''),
1300        ..GDEF
1301    },
1302    Glyph {
1303        shape: GlyphShape::Asterisk,
1304        name: "asterisk",
1305        normal: Some('*'),
1306        subscript: None,
1307        superscript: None,
1308        ..GDEF
1309    },
    // Code points 0o60 to 0o77 are non-graphic characters.
1311];
1312
/// Maps Unicode characters onto [`Glyph`] instances describing them.
///
/// The [`Default`] implementation populates the table from
/// `ALL_GLYPHS`; lookups go through the `get` method.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct GlyphMapByChar {
    /// Keyed by a single Unicode character.  The value pairs the
    /// glyph with the script (normal, superscript or subscript) for
    /// which that character was registered.
    mapping: HashMap<char, Elevated<&'static Glyph>>,
}
1318
/// Read-only shared instance of [`GlyphMapByChar`].
///
/// The cell starts out empty; [`glyph_map`] fills it in on first use
/// and hands out references to the stored value thereafter.
static GLYPH_MAP_BY_CHAR: OnceLock<GlyphMapByChar> = OnceLock::new();
1321
1322impl Default for GlyphMapByChar {
1323    fn default() -> Self {
1324        let mut mapping = HashMap::new();
1325        for g in ALL_GLYPHS {
1326            for script in [Script::Sub, Script::Super, Script::Normal] {
1327                if g.normal == Some(' ') && script != Script::Normal {
1328                    // Note that the space character has the same
1329                    // representation in normal script, superscript
1330                    // and subscript.  We have a convention that space
1331                    // is always deemed to be in normal script.
1332                    continue;
1333                }
1334                if let Some(key) = g.get_char(script) {
1335                    let value = elevate(script, g);
1336                    if let Some(prev) = mapping.insert(key, value) {
1337                        panic!("duplicate glyph mapping for character '{key}': {g:?} and {prev:?}");
1338                    }
1339                }
1340            }
1341        }
1342        Self { mapping }
1343    }
1344}
1345
1346impl GlyphMapByChar {
1347    fn get(&self, ch: char) -> Option<Elevated<&'static Glyph>> {
1348        self.mapping.get(&ch).copied()
1349    }
1350}
1351
1352/// Return a reference to the shared instance of [`GlyphMapByChar`].
1353pub(crate) fn glyph_map() -> &'static GlyphMapByChar {
1354    GLYPH_MAP_BY_CHAR.get_or_init(GlyphMapByChar::default)
1355}
1356
/// Additional mappings of Unicode input to prevent user confusion.
///
/// Two substitutions are made; every other character is returned
/// unchanged:
///
/// * "." (U+002E) becomes the centre dot "·" (U+00B7).  We use the
///   centre dot as a decimal point, but the full stop is likely to
///   be a common choice so we accept it too.
/// * ":" becomes "h".  This is a synonym for "h" in setting the hold
///   bit, accepted so that we can assemble source code from earlier
///   papers which used the older convention, such as H. Philip
///   Peterson's "[Some Examples of TX-2
///   Programming](http://www.bitsavers.org/pdf/mit/tx-2/6M-5780_Some_Examples_of_TX-2_Programming_Jul1958.pdf)"
///   (Lincoln Lab memo 6M-5780, 23 July 1958).
fn canonicalise_char(ch: char) -> char {
    // Note that U+A7F2 (ꟲ) is deliberately not converted to U+1D9C:
    // the former is a majuscule (capital) letter and the latter is a
    // minuscule (lower-case) letter.
    if ch == '.' {
        '\u{00B7}' // . -> ·
    } else if ch == ':' {
        // The TX-2 character set doesn't include ':', but some of
        // the older sources use it to signal that the hold bit
        // should be set in an instruction.  In the Users Handbook
        // (in 1961 at least) this function is performed by 'h'.
        'h'
    } else {
        ch
    }
}
1384
1385/// Convert a Unicode character into its `@...@` synonym.
1386pub(crate) fn name_from_glyph(mut ch: char) -> Option<&'static str> {
1387    // TODO: do we need both this and glyph_from_name?
1388    ch = canonicalise_char(ch);
1389    ALL_GLYPHS
1390        .iter()
1391        .find(|g| g.normal == Some(ch))
1392        .map(|g| g.name)
1393}
1394
1395/// Convert a Unicode string into [`Elevated<&'static Glyph>`].
1396///
1397/// Return `None` if the name of the glyph is not recognised.
1398pub(crate) fn glyph_from_name(name: &str) -> Option<Elevated<&'static Glyph>> {
1399    let (script, glyph_base_name) = if let Some(suffix) = name.strip_prefix("sub_") {
1400        (Script::Sub, suffix)
1401    } else if let Some(suffix) = name.strip_prefix("sup_") {
1402        (Script::Super, suffix)
1403    } else {
1404        (Script::Normal, name)
1405    };
1406    ALL_GLYPHS
1407        .iter()
1408        .find(|g| g.name == glyph_base_name)
1409        .map(|g| elevate(script, g))
1410}
1411
1412/// Return true if this character is allowed in a symex (symbol name).
1413///
1414/// Specified in Users Handbook section 6-2.3 item 6.
1415pub(crate) fn is_allowed_in_symex(g: GlyphShape) -> bool {
1416    match g {
1417        // Eeasier to understand if we don't re-order the match arms.
1418        #![allow(clippy::match_same_arms)]
1419        GlyphShape::Digit0 |
1420        GlyphShape::Digit1 |
1421        GlyphShape::Digit2 |
1422        GlyphShape::Digit3 |
1423        GlyphShape::Digit4 |
1424        GlyphShape::Digit5 |
1425        GlyphShape::Digit6 |
1426        GlyphShape::Digit7 |
1427        GlyphShape::Digit8 |
1428        GlyphShape::Digit9 |
1429        GlyphShape::A |
1430        GlyphShape::B |
1431        GlyphShape::C |
1432        GlyphShape::D |
1433        GlyphShape::E |
1434        GlyphShape::F |
1435        GlyphShape::G |
1436        GlyphShape::H |
1437        GlyphShape::I |
1438        GlyphShape::J |
1439        GlyphShape::K |
1440        GlyphShape::L |
1441        GlyphShape::M |
1442        GlyphShape::N |
1443        GlyphShape::O |
1444        GlyphShape::P |
1445        GlyphShape::Q |
1446        GlyphShape::R |
1447        GlyphShape::S |
1448        GlyphShape::T |
1449        GlyphShape::U |
1450        GlyphShape::V |
1451        GlyphShape::W |
1452        GlyphShape::X |
1453        GlyphShape::Y |
1454        GlyphShape::Z |
1455        GlyphShape::Alpha|
1456        GlyphShape::Beta |
1457        GlyphShape::Gamma |
1458        GlyphShape::Delta  |
1459        GlyphShape::Epsilon |
1460        GlyphShape::Lambda |
1461        // Note: h is not allowed.
1462        GlyphShape::i |
1463        GlyphShape::j |
1464        GlyphShape::k |
1465        GlyphShape::n |
1466        GlyphShape::p |
1467        GlyphShape::q |
1468        GlyphShape::t |
1469        GlyphShape::w |
1470        GlyphShape::x |
1471        GlyphShape::y |
1472        GlyphShape::z |
1473        GlyphShape::Dot |
1474        GlyphShape::Apostrophe |
1475        GlyphShape::Underscore |
1476        GlyphShape::Overbar |
1477        GlyphShape::Square |
1478        GlyphShape::Circle => true,
1479        GlyphShape::Space => {
1480            // Space bar is allowed in a symex, per section 6-2.3.
1481            // But that doesn't necessarily mean that other space
1482            // characters are.  However, we treat space and tab the
1483            // same, and don't include them in the symex syllable
1484            // token (instead we join symex syllables together in the
1485            // parser).
1486            true
1487        }
1488        _ => false,
1489    }
1490}