assembler/glyph.rs
1//! Implement the `@...@` constructs in the source code.
2//!
//! We use `@...@` to represent the characters that the TX-2 supports
4//! but which Unicode does not. For example, `@sub_A@` which
5//! represents a subscripted letter A.
6//!
7//! We use the word "glyph" to denote the characters in the source
8//! code. This includes spacing characters (Tab, Backspace, Space but
9//! not carriage return) but not shift codes (COLOR BLACK, SUPER,
10//! NORMAL, SUB, COLOR RED) or special keys which would not appear in
11//! source code (WORD EXAM, LINE FEED DOWN, LINE FEED UP, LOWER CASE,
12//! UPPER CASE, STOP).
13//!
14//! We also do not include the NULLIFY character. This would
15//! certainly have been used in the input to the TX-2 assembler ("M4")
//! to indicate that a character on the paper tape was deleted (by
17//! punching out all the holes, setting all the bits to 1). But this
18//! would have no role in the preparation of source code on a modern
19//! computer system (e.g. with a text editor) so we don't currently
20//! support this in the input.
21//!
22//! Due to the complexities of lexing and representing the TX-2's compound characters
//! (see for example [section 6-2.3 of the Users Handbook, "RULES FOR SYMEX FORMATION"](https://archive.org/details/tx-2-users-handbook-nov-63/page/n158/mode/1up)) we might later include additional glyphs to represent compound characters.
24//!
25//! The [`base::charset`] module deals with similar things, but this
26//! module deals with concerns that are unique to the assembler itself
27//! (that is, concerns which are not relevant to the implementation of
28//! the TX-2 emulator).
29use std::collections::HashMap;
30use std::error::Error;
31use std::fmt::{self, Debug, Display, Formatter, Write};
32use std::hash::Hash;
33use std::sync::OnceLock;
34
35use base::charset::{Script, subscript_char, superscript_char};
36
/// An item in the input which the M4 assembler would not understand.
///
/// This is either a Unicode character or a `@...@` glyph.  Characters
/// which appear only in comments or annotations are excepted.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum Unrecognised {
    /// The Unicode character is unknown on the TX-2.
    InvalidChar(char),
    /// `@foo@` appeared in the input, but `foo` is not the name of
    /// any glyph we know.
    UnrecognisedGlyph(String),
}

impl Display for Unrecognised {
    /// Writes a human-readable description of the unrecognised item.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnrecognisedGlyph(name) => {
                write!(f, "'@{name}@' is not a recognised glyph name")
            }
            Self::InvalidChar(ch) => {
                write!(
                    f,
                    "'{ch}' is not part of the TX-2 assembler's character set"
                )
            }
        }
    }
}

impl Error for Unrecognised {}
67
68/// Indicates the super/sub/normal script of something.
69///
70/// `Elevated<T>` indicates that a `T` appears in superscript,
71/// subscript or normal script. This changes the meaning (and
72/// numerical value) of that item in the TX-2 assembly language.
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
74pub(crate) struct Elevated<T> {
75 inner: T,
76 script: Script,
77}
78
79impl<T> Elevated<T> {
80 pub(crate) fn script(&self) -> Script {
81 self.script
82 }
83
84 pub(crate) fn get(&self) -> &T {
85 &self.inner
86 }
87}
88
/// Private helper trait giving uniform `&str` access to string-like
/// values, as required by the `Display` implementation of
/// [`Elevated`].
trait AsStr {
    /// Returns the value viewed as a string slice.
    fn as_str(&self) -> &str;
}

impl AsStr for String {
    fn as_str(&self) -> &str {
        // Inherent methods take priority over trait methods, so this
        // is `String::as_str` and not a recursive call.
        String::as_str(self)
    }
}

impl AsStr for &str {
    fn as_str(&self) -> &str {
        let text: &str = self;
        text
    }
}
104
105impl<T: AsStr> Display for Elevated<T> {
106 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
107 match self.script {
108 Script::Normal => write!(f, "{}", &self.inner.as_str()),
109 Script::Super => {
110 for ch in self.inner.as_str().chars() {
111 match superscript_char(ch) {
112 Ok(superchar) => {
113 f.write_char(superchar)?;
114 }
115 Err(_) => match glyph_of_char(ch) {
116 Ok(elevated_glyph) => {
117 let glyph = elevated_glyph.get();
118 if let Some(superchar) = glyph.superscript {
119 f.write_char(superchar)?;
120 } else {
121 write!(f, "@sup_{}@", glyph.name)?;
122 }
123 }
124 Err(_) => {
125 unimplemented!("superscript variant of {ch}")
126 }
127 },
128 }
129 }
130 Ok(())
131 }
132 Script::Sub => {
133 for ch in self.inner.as_str().chars() {
134 match subscript_char(ch) {
135 Ok(subchar) => {
136 f.write_char(subchar)?;
137 }
138 Err(_) => match glyph_of_char(ch) {
139 Ok(elevated_glyph) => {
140 let glyph = elevated_glyph.get();
141 if let Some(superchar) = glyph.superscript {
142 f.write_char(superchar)?;
143 } else {
144 write!(f, "@sub_{}@", glyph.name)?;
145 }
146 }
147 Err(_) => {
148 unimplemented!("find subscript variant of {ch}")
149 }
150 },
151 }
152 }
153 Ok(())
154 }
155 }
156 }
157}
158
159impl<T> From<(Script, T)> for Elevated<T> {
160 fn from((script, inner): (Script, T)) -> Elevated<T> {
161 Elevated { inner, script }
162 }
163}
164
165/// Create an instance of [`Elevated<T>`].
166pub(crate) fn elevate<T>(script: Script, inner: T) -> Elevated<T> {
167 Elevated { inner, script }
168}
169
170/// A character which might appear in source code.
171///
172/// We include mappings to Unicode representation where this exists.
173/// However, there are also cases where more than one Unicode
174/// character (in the assembler input) might get mapped to the same
175/// Glyph; see [`canonicalise_char`].
176#[derive(Debug, PartialEq, Eq)]
177pub(crate) struct Glyph {
178 /// Indicates the shape of the glyph without regard to its
179 /// (superscript/subscript/normal) position with respect to the
180 /// character baseline.
181 pub(crate) shape: GlyphShape,
182 /// The name of the glyph as we would use it inside `@...@`.
183 pub(crate) name: &'static str,
184 /// The Unicode representation of this glyph when in normal
185 /// script.
186 pub(crate) normal: Option<char>,
187 /// The Unicode representation of this glyph when in superscript.
188 pub(crate) superscript: Option<char>,
189 /// The Unicode representation of this glyph when in subscript.
190 pub(crate) subscript: Option<char>,
191 /// When advance is false, this glyph does not advance the Lincoln
192 /// Writer's print carriage. This appears to be true for
193 /// character codes 0o12 (underbar, overbar) and 0o13 (circle,
194 /// square). We should provide a reference for this, but just now
195 /// I'm taking this info from the code in base/src/charset.rs
196 /// which deals with these character codes.
197 ///
198 /// We try to use combining characters for these.
199 pub(crate) advance: bool,
200}
201
202impl Glyph {
203 pub(crate) fn shape(&self) -> GlyphShape {
204 self.shape
205 }
206
207 pub(crate) fn get_char(&self, script: Script) -> Option<char> {
208 match script {
209 Script::Normal => self.normal,
210 Script::Super => self.superscript,
211 Script::Sub => self.subscript,
212 }
213 }
214}
215
/// Wherever both the glyph table and `subscript_char` provide a
/// subscript form for a normal-script character, they must agree.
#[test]
fn test_subscript_char_agreement() {
    for g in ALL_GLYPHS {
        let (Some(ch), Some(glyph_sub_ch)) = (g.normal, g.subscript) else {
            continue;
        };
        let Ok(charset_sub_ch) = subscript_char(ch) else {
            continue;
        };
        assert_eq!(
            glyph_sub_ch,
            charset_sub_ch,
            "glyph {g:?} maps {ch} to {glyph_sub_ch} ({}) but subscript_char maps it to {charset_sub_ch} ({})",
            glyph_sub_ch.escape_unicode(),
            charset_sub_ch.escape_unicode(),
        );
    }
}
233
/// Wherever both the glyph table and `superscript_char` provide a
/// superscript form for a normal-script character, they must agree.
#[test]
fn test_superscript_char_agreement() {
    for g in ALL_GLYPHS {
        let (Some(ch), Some(glyph_sup_ch)) = (g.normal, g.superscript) else {
            continue;
        };
        let Ok(charset_sup_ch) = superscript_char(ch) else {
            continue;
        };
        assert_eq!(
            glyph_sup_ch,
            charset_sup_ch,
            "glyph {g:?} maps {ch} to {glyph_sup_ch} ({}) but superscript_char maps it to {charset_sup_ch} ({})",
            glyph_sup_ch.escape_unicode(),
            charset_sup_ch.escape_unicode(),
        );
    }
}
251
#[test]
fn test_glyph_names_do_not_contain_underscore() {
    // The sup_ and sub_ prefixes use an underscore as a kind of
    // separator, so allowing underscores in glyph names is probably
    // too confusing.  This test stops someone introducing one.
    //
    // A convention is also under consideration in which a combining
    // character is specified by including the name of the second
    // character inside @...@.  For example @square_minus@ would
    // denote a square (which does not advance the carriage) followed
    // by a minus sign (which does).  That plan would require '_' to
    // be treated as a glyph name separator.
    for name in ALL_GLYPHS.iter().map(|g| g.name) {
        assert!(
            !name.contains('_'),
            "glyph name {} should not contain an underscore",
            name
        );
    }
}
272
// TODO: probably doesn't need to be a module.
mod shape {
    //! Exists to limit the effect of `allow(non_camel_case_types)`;
    //! probably not needed.

    /// The character shapes of the Lincoln Writer.
    ///
    /// This lists all the character shapes in the character set table
    /// on page 2 of the documentation on the Lincoln Writer channels
    /// (65, 66), TX-2 Users Handbook, July 1961.
    ///
    /// The variant order matters, because `PartialOrd` and `Ord` are
    /// derived.
    // Some variants are named after lower-case letters (for example
    // `n`, distinct from `N`), hence the lint exemption.
    #[allow(non_camel_case_types)]
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
    pub(crate) enum GlyphShape {
        Digit0,
        Digit1,
        Digit2,
        Digit3,
        Digit4,
        Digit5,
        Digit6,
        Digit7,
        Digit8,
        Digit9,
        Underscore,
        Circle,
        A,
        B,
        C,
        D,
        E,
        F,
        G,
        H,
        I,
        J,
        K,
        L,
        M,
        N,
        O,
        P,
        Q,
        R,
        S,
        T,
        U,
        V,
        W,
        X,
        Y,
        Z,
        LeftParen,
        RightParen,
        Add,
        Minus,
        Comma,
        Dot,
        // CARRIAGE RETURN is deliberately absent.
        Tab,
        Backspace,
        // COLOR BLACK, SUPER, NORMAL, SUB and COLOR RED are
        // deliberately absent.
        Space,
        // WORD EXAM, LINE FEED DOWN, LINE FEED UP, LOWER CASE, UPPER
        // CASE, STOP and NULLIFY are deliberately absent.
        Hand,
        Sigma,
        Pipe,
        DoublePipe,
        Solidus,
        Times,
        Hash,
        Arrow,
        LessThan,
        GreaterThan,
        Overbar,
        Square,
        n,
        SubsetOf,
        Or,
        q,
        Gamma,
        t,
        w,
        x,
        i,
        y,
        z,
        Query,
        Union,
        Intersection,
        j,
        k,
        Alpha,
        Delta,
        p,
        Epsilon,
        h,
        SupersetOf,
        Beta,
        And,
        Lambda,
        Tilde,
        LeftBrace,
        RightBrace,
        IdenticalTo, /* hamb */
        Equals,
        Apostrophe,
        Asterisk,
    }
}
pub(crate) use shape::GlyphShape;
384
/// Error indicating that a Unicode character (held in field 0) does
/// not exist in the TX-2 character set.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct NotInCharacterSet(pub char);

impl Display for NotInCharacterSet {
    /// Writes a message naming the offending character.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let NotInCharacterSet(ch) = self;
        write!(
            f,
            "Character '{}' is not in the TX-2's Lincoln Writer character set",
            ch
        )
    }
}

impl Error for NotInCharacterSet {}
400
401/// Convert a superscript/subscript/normal Unicode character, if we
402/// recognise it, into [`Elevated<&'static Glyph>`].
403pub(crate) fn glyph_of_char(original: char) -> Result<Elevated<&'static Glyph>, Unrecognised> {
404 let ch: char = canonicalise_char(original);
405 let mapping = glyph_map();
406 match mapping.get(ch) {
407 Some(elevated) => Ok(elevated),
408 None => Err(Unrecognised::InvalidChar(original)),
409 }
410}
411
/// A space must be recognised, and must be reported as normal script.
#[test]
fn test_space_is_normal() {
    let elevated = glyph_of_char(' ')
        .unwrap_or_else(|e| panic!("unexpected failure to look up space: {e:?}"));
    assert_eq!(elevated.script(), Script::Normal);
}
423
424impl TryFrom<char> for Elevated<&'static Glyph> {
425 type Error = NotInCharacterSet;
426
427 fn try_from(ch: char) -> Result<Self, Self::Error> {
428 glyph_of_char(ch).map_err(|_| NotInCharacterSet(ch))
429 }
430}
431
/// An ASCII full stop and a centre dot must look up identically.
#[test]
fn test_glyph_of_dot() {
    let full_stop = glyph_of_char('.');
    let centre_dot = glyph_of_char('·');
    assert_eq!(full_stop, centre_dot);
}
436
437//const fn code_point_of_shape(g: GlyphShape) -> (LwCase, Unsigned6Bit) {
438// use base::charset::LwCase;
//    use base::{u6, Unsigned6Bit};
440// // Information taken from the character set table from page 2 of
441// // the documentation on the Lincoln Writer channels (65, 66).
442// // TX-2 Users Handbook, July 1961.
443// const L: LwCase = LwCase::Lower;
444// const U: LwCase = LwCase::Upper;
445// match g {
446// GlyphShape::Digit0 => (L, u6!(0)),
447// GlyphShape::Digit1 => (L, u6!(1)),
448// GlyphShape::Digit2 => (L, u6!(2)),
449// GlyphShape::Digit3 => (L, u6!(3)),
450// GlyphShape::Digit4 => (L, u6!(4)),
451// GlyphShape::Digit5 => (L, u6!(5)),
452// GlyphShape::Digit6 => (L, u6!(6)),
453// GlyphShape::Digit7 => (L, u6!(7)),
454// GlyphShape::Digit8 => (L, u6!(0o10)),
455// GlyphShape::Digit9 => (L, u6!(0o11)),
456// GlyphShape::Underscore => (L, u6!(0o12)),
457// GlyphShape::Circle => (L, u6!(0o13)),
458// GlyphShape::A => (L, u6!(0o20)),
459// GlyphShape::B => (L, u6!(0o21)),
460// GlyphShape::C => (L, u6!(0o22)),
461// GlyphShape::D => (L, u6!(0o23)),
462// GlyphShape::E => (L, u6!(0o24)),
463// GlyphShape::F => (L, u6!(0o25)),
464// GlyphShape::G => (L, u6!(0o26)),
465// GlyphShape::H => (L, u6!(0o27)),
466// GlyphShape::I => (L, u6!(0o30)),
467// GlyphShape::J => (L, u6!(0o31)),
468// GlyphShape::K => (L, u6!(0o32)),
469// GlyphShape::L => (L, u6!(0o33)),
470// GlyphShape::M => (L, u6!(0o34)),
471// GlyphShape::N => (L, u6!(0o35)),
472// GlyphShape::O => (L, u6!(0o36)),
473// GlyphShape::P => (L, u6!(0o37)),
474// GlyphShape::Q => (L, u6!(0o40)),
475// GlyphShape::R => (L, u6!(0o41)),
476// GlyphShape::S => (L, u6!(0o42)),
477// GlyphShape::T => (L, u6!(0o43)),
478// GlyphShape::U => (L, u6!(0o44)),
479// GlyphShape::V => (L, u6!(0o45)),
480// GlyphShape::W => (L, u6!(0o46)),
481// GlyphShape::X => (L, u6!(0o47)),
482// GlyphShape::Y => (L, u6!(0o50)),
483// GlyphShape::Z => (L, u6!(0o51)),
484// GlyphShape::LeftParen => (L, u6!(0o52)),
485// GlyphShape::RightParen => (L, u6!(0o53)),
486// GlyphShape::Add => (L, u6!(0o54)),
487// GlyphShape::Minus => (L, u6!(0o55)),
488// GlyphShape::Comma => (L, u6!(0o56)),
489// GlyphShape::Dot => (L, u6!(0o57)),
490// GlyphShape::Tab => (L, u6!(0o61)),
491// GlyphShape::Backspace => (L, u6!(0o62)),
492// // 0o63 is COLOR BLACK
493// //
494// // 0o64 is SUPER
495// //
496// // 0o65 is NORMAL
497// //
498// // 0o66 is SUB
499// //
500// // 0o67 is COLOR RED
501// GlyphShape::Space => (L, u6!(0o70)),
502// // 0o71 is WORD EXAM
503// //
504// // 0o72 is LINE FEED DOWN
505// //
506// // 0o73 is LINE FEED UP
507// //
508// // 0o74 is LOWER CASE
509// //
510// // 0o75 is UPPER CASE
511// //
512// // 0o76 is STOP
513// //
514// // 0o77 is NULLIFY
515// GlyphShape::Hand => (U, u6!(0)),
516// GlyphShape::Sigma => (U, u6!(1)),
517// GlyphShape::Pipe => (U, u6!(2)),
518// GlyphShape::DoublePipe => (U, u6!(3)),
519// GlyphShape::Solidus => (U, u6!(4)),
520// GlyphShape::Times => (U, u6!(5)),
521// GlyphShape::Hash => (U, u6!(6)),
522// GlyphShape::Arrow => (U, u6!(7)),
523// GlyphShape::LessThan => (U, u6!(0o10)),
524// GlyphShape::GreaterThan => (U, u6!(0o11)),
525// GlyphShape::Overbar => (U, u6!(0o12)),
526// GlyphShape::Square => (U, u6!(0o13)),
527// // 0o14 is "READ IN"
528// //
529// // 0o15 is "BEGIN"
530// //
531// // 0o16 is "NO"
532// //
533// // 0o17 is "YES"
534// GlyphShape::n => (U, u6!(0o20)),
535// GlyphShape::SubsetOf => (U, u6!(0o21)),
536// GlyphShape::Or => (U, u6!(0o22)),
537// GlyphShape::q => (U, u6!(0o23)),
538// GlyphShape::Gamma => (U, u6!(0o24)),
539// GlyphShape::t => (U, u6!(0o25)),
540// GlyphShape::w => (U, u6!(0o26)),
541// GlyphShape::x => (U, u6!(0o27)),
542// GlyphShape::i => (U, u6!(0o30)),
543// GlyphShape::y => (U, u6!(0o31)),
544// GlyphShape::z => (U, u6!(0o32)),
545// GlyphShape::Query => (U, u6!(0o33)),
546// GlyphShape::Union => (U, u6!(0o34)),
547// GlyphShape::Intersection => (U, u6!(0o35)),
548// GlyphShape::j => (U, u6!(0o36)),
549// GlyphShape::k => (U, u6!(0o37)),
550// GlyphShape::Alpha => (U, u6!(0o40)),
551// GlyphShape::Delta => (U, u6!(0o41)),
552// GlyphShape::p => (U, u6!(0o42)),
553// GlyphShape::Epsilon => (U, u6!(0o43)),
554// GlyphShape::h => (U, u6!(0o44)),
555// GlyphShape::SupersetOf => (U, u6!(0o45)),
556// GlyphShape::Beta => (U, u6!(0o46)),
557// GlyphShape::And => (U, u6!(0o47)),
558// GlyphShape::Lambda => (U, u6!(0o50)),
559// GlyphShape::Tilde => (U, u6!(0o51)),
560// GlyphShape::LeftBrace => (U, u6!(0o52)),
561// GlyphShape::RightBrace => (U, u6!(0o53)),
562// GlyphShape::IdenticalTo => (U, u6!(0o54)), // @hamb@
563// GlyphShape::Equals => (U, u6!(0o55)),
564// GlyphShape::Apostrophe => (U, u6!(0o56)),
565// GlyphShape::Asterisk => (U, u6!(0o57)),
//        // Code points 0o60 to 0o77 are non-graphic characters.
567// }
568//}
569
570/// Used to save typing to provide defaults in the definitions in
571/// [`ALL_GLYPHS`].
572const GDEF: Glyph = Glyph {
573 shape: GlyphShape::Hand,
574 name: "",
575 normal: None,
576 superscript: None,
577 subscript: None,
578 advance: true,
579};
580
581/// Symbols understood by the M4 assembler (other than compound
582/// symbols).
583///
584/// Information taken from the character set table from page 2 of
585/// the documentation on the Lincoln Writer channels (65, 66).
586/// TX-2 Users Handbook, July 1961.
587const ALL_GLYPHS: &[Glyph] = &[
588 Glyph {
589 shape: GlyphShape::Digit0,
590 name: "0",
591 normal: Some('0'),
592 superscript: Some('⁰'),
593 subscript: Some('₀'),
594 ..GDEF
595 },
596 Glyph {
597 shape: GlyphShape::Digit1,
598 name: "1",
599 normal: Some('1'),
600 subscript: Some('₁'),
601 superscript: Some('¹'),
602 ..GDEF
603 },
604 Glyph {
605 shape: GlyphShape::Digit2,
606 name: "2",
607 normal: Some('2'),
608 subscript: Some('₂'),
609 superscript: Some('²'),
610 ..GDEF
611 },
612 Glyph {
613 shape: GlyphShape::Digit3,
614 name: "3",
615 normal: Some('3'),
616 subscript: Some('₃'),
617 superscript: Some('³'),
618 ..GDEF
619 },
620 Glyph {
621 shape: GlyphShape::Digit4,
622 name: "4",
623 normal: Some('4'),
624 subscript: Some('₄'),
625 superscript: Some('⁴'),
626 ..GDEF
627 },
628 Glyph {
629 shape: GlyphShape::Digit5,
630 name: "5",
631 normal: Some('5'),
632 subscript: Some('₅'),
633 superscript: Some('⁵'),
634 ..GDEF
635 },
636 Glyph {
637 shape: GlyphShape::Digit6,
638 name: "6",
639 normal: Some('6'),
640 subscript: Some('₆'),
641 superscript: Some('⁶'),
642 ..GDEF
643 },
644 Glyph {
645 shape: GlyphShape::Digit7,
646 name: "7",
647 normal: Some('7'),
648 subscript: Some('₇'),
649 superscript: Some('⁷'),
650 ..GDEF
651 },
652 Glyph {
653 shape: GlyphShape::Digit8,
654 name: "8",
655 normal: Some('8'),
656 subscript: Some('₈'),
657 superscript: Some('⁸'),
658 ..GDEF
659 },
660 Glyph {
661 shape: GlyphShape::Digit9,
662 name: "9",
663 normal: Some('9'),
664 subscript: Some('₉'),
665 superscript: Some('⁹'),
666 ..GDEF
667 },
668 Glyph {
669 shape: GlyphShape::Underscore,
670 name: "underscore",
671 // This character does not advance the carriage, so instead of
672 // representing it with ASCII \x5F (underscore) we use a
673 // combining low line.
674 normal: Some('\u{0332}'), // U+0332, combining low line
675 advance: false,
676 ..GDEF
677 },
678 Glyph {
679 shape: GlyphShape::Circle,
680 name: "circle",
681 // U+25CB, Unicode white circle, '○', advances the cursor
682 // position, which the Lincoln Writer code (0o13) doesn't do.
683 // So we use a combining character.
684 normal: Some('\u{20DD}'), // U+20DD, combining enclosing circle
685 advance: false,
686 ..GDEF
687 },
688 // 0o14 is "READ IN"
689 //
690 // 0o15 is "BEGIN"
691 //
692 // 0o16 is "NO"
693 //
694 // 0o17 is "YES"
695 Glyph {
696 shape: GlyphShape::A,
697 name: "A",
698 normal: Some('A'),
699 superscript: Some('ᴬ'),
700 ..GDEF
701 },
702 Glyph {
703 shape: GlyphShape::B,
704 name: "B",
705 normal: Some('B'),
706 superscript: Some('ᴮ'),
707 ..GDEF
708 },
709 Glyph {
710 shape: GlyphShape::C,
711 name: "C",
712 normal: Some('C'),
713 superscript: Some('ꟲ'), // U+A7F2 (we don't use U+1D9C, that's the lower-case C)
714 ..GDEF
715 },
716 Glyph {
717 shape: GlyphShape::D,
718 name: "D",
719 normal: Some('D'),
720 superscript: Some('ᴰ'),
721 ..GDEF
722 },
723 Glyph {
724 shape: GlyphShape::E,
725 name: "E",
726 normal: Some('E'),
727 superscript: Some('ᴱ'),
728 ..GDEF
729 },
730 Glyph {
731 shape: GlyphShape::F,
732 name: "F",
733 normal: Some('F'),
734 superscript: Some('ꟳ'),
735 ..GDEF
736 },
737 Glyph {
738 shape: GlyphShape::G,
739 name: "G",
740 normal: Some('G'),
741 superscript: Some('ᴳ'),
742 ..GDEF
743 },
744 Glyph {
745 shape: GlyphShape::H,
746 name: "H",
747 normal: Some('H'),
748 superscript: Some('ᴴ'),
749 ..GDEF
750 },
751 Glyph {
752 shape: GlyphShape::I,
753 name: "I",
754 normal: Some('I'),
755 superscript: Some('ᴵ'),
756 ..GDEF
757 },
758 Glyph {
759 shape: GlyphShape::J,
760 name: "J",
761 normal: Some('J'),
762 superscript: Some('ᴶ'),
763 ..GDEF
764 },
765 Glyph {
766 shape: GlyphShape::K,
767 name: "K",
768 normal: Some('K'),
769 superscript: Some('ᴷ'),
770 ..GDEF
771 },
772 Glyph {
773 shape: GlyphShape::L,
774 name: "L",
775 normal: Some('L'),
776 superscript: Some('ᴸ'),
777 ..GDEF
778 },
779 Glyph {
780 shape: GlyphShape::M,
781 name: "M",
782 normal: Some('M'),
783 superscript: Some('ᴹ'),
784 ..GDEF
785 },
786 Glyph {
787 shape: GlyphShape::N,
788 name: "N",
789 normal: Some('N'),
790 superscript: Some('ᴺ'),
791 ..GDEF
792 },
793 Glyph {
794 shape: GlyphShape::O,
795 name: "O",
796 normal: Some('O'),
797 superscript: Some('ᴼ'),
798 ..GDEF
799 },
800 Glyph {
801 shape: GlyphShape::P,
802 name: "P",
803 normal: Some('P'),
804 superscript: Some('ᴾ'),
805 ..GDEF
806 },
807 Glyph {
808 shape: GlyphShape::Q,
809 name: "Q",
810 normal: Some('Q'),
811 superscript: Some('ꟴ'),
812 ..GDEF
813 },
814 Glyph {
815 shape: GlyphShape::R,
816 name: "R",
817 normal: Some('R'),
818 superscript: Some('ᴿ'),
819 ..GDEF
820 },
821 Glyph {
822 shape: GlyphShape::S,
823 name: "S",
824 normal: Some('S'),
825 // There is no Unicode superscript 'S', U+2E2 is a superscript 's'.
826 superscript: None,
827 ..GDEF
828 },
829 Glyph {
830 shape: GlyphShape::T,
831 name: "T",
832 normal: Some('T'),
833 superscript: Some('ᵀ'),
834 ..GDEF
835 },
836 Glyph {
837 shape: GlyphShape::U,
838 name: "U",
839 normal: Some('U'),
840 superscript: Some('ᵁ'),
841 ..GDEF
842 },
843 Glyph {
844 shape: GlyphShape::V,
845 name: "V",
846 normal: Some('V'),
847 superscript: Some('ⱽ'),
848 ..GDEF
849 },
850 Glyph {
851 shape: GlyphShape::W,
852 name: "W",
853 normal: Some('W'),
854 superscript: Some('ᵂ'),
855 ..GDEF
856 },
857 Glyph {
858 shape: GlyphShape::X,
859 name: "X",
860 normal: Some('X'),
861 // There is no superscript X in Unicode.
862 ..GDEF
863 },
864 Glyph {
865 shape: GlyphShape::Y,
866 name: "Y",
867 normal: Some('Y'),
868 // There is no superscript Y in Unicode.
869 ..GDEF
870 },
871 Glyph {
872 shape: GlyphShape::Z,
873 name: "Z",
874 normal: Some('Z'),
875 // There is no superscript Z in Unicode.
876 ..GDEF
877 },
878 Glyph {
879 shape: GlyphShape::LeftParen,
880 name: "lparen",
881 normal: Some('('),
882 subscript: Some('₍'),
883 ..GDEF
884 },
885 Glyph {
886 shape: GlyphShape::RightParen,
887 name: "rparen",
888 normal: Some(')'),
889 subscript: Some('₎'),
890 ..GDEF
891 },
892 Glyph {
893 shape: GlyphShape::Add,
894 name: "add", // following sub.py
895 normal: Some('+'),
896 superscript: Some('⁺'),
897 subscript: Some('₊'),
898 ..GDEF
899 },
900 Glyph {
901 shape: GlyphShape::Minus,
902 name: "minus", // following sub.py
903 normal: Some('-'),
904 superscript: Some('⁻'),
905 subscript: Some('₋'),
906 ..GDEF
907 },
908 Glyph {
909 shape: GlyphShape::Comma,
910 name: "comma",
911 normal: Some(','),
912 ..GDEF
913 },
914 Glyph {
915 shape: GlyphShape::Dot,
916 name: "dot",
917 // This is a centre dot, not a period. We use a centre dot so
918 // that it's not confused with a subscript dot.
919 normal: Some('\u{00B7}'), // ·
920
921 // Using an ASCII full stop / period (".") would be too
922 // confusing for the user, who (when preparing source code
923 // input) might expect this to be interpreted as the
924 // normal-script PERIOD. So for subscript we instead use
925 // U+2024, "One Dot Leader".
926 subscript: Some('\u{2024}'), // "․" (not ASCII ".")
927 superscript: None,
928 ..GDEF
929 },
930 // CARRIAGE RETURN is missing.
931 Glyph {
932 shape: GlyphShape::Tab,
933 name: "tab",
934 normal: Some('\t'),
935 ..GDEF
936 },
937 Glyph {
938 // backspace is used in some combining-character symexes.
939 shape: GlyphShape::Backspace,
940 name: "backspace",
941 normal: None, // better to say @backspace@.
942 ..GDEF
943 },
944 // 0o63 is COLOR BLACK
945 //
946 // 0o64 is SUPER
947 //
948 // 0o65 is NORMAL
949 //
950 // 0o66 is SUB
951 //
952 // 0o67 is COLOR RED
953 Glyph {
954 shape: GlyphShape::Space,
955 name: "space",
956 normal: Some(' '),
957 subscript: Some(' '),
958 superscript: Some(' '),
959 ..GDEF
960 },
961 // 0o71 is WORD EXAM
962 //
963 // 0o72 is LINE FEED DOWN
964 //
965 // 0o73 is LINE FEED UP
966 //
967 // 0o74 is LOWER CASE
968 //
969 // 0o75 is UPPER CASE
970 //
971 // 0o76 is STOP
972 //
973 // 0o77 is NULLIFY
974 //
975 //
976 // Right-hand column of the character set table follows.
977 Glyph {
978 shape: GlyphShape::Hand,
979 name: "hand",
980 normal: Some('☛'), // U+261B
981 ..GDEF
982 },
983 Glyph {
984 shape: GlyphShape::Sigma,
985 name: "sigma",
986 normal: Some('Σ'), // U+03A3
987 ..GDEF
988 },
989 Glyph {
990 shape: GlyphShape::Pipe,
991 name: "pipe",
992 normal: Some('|'),
993 ..GDEF
994 },
995 Glyph {
996 shape: GlyphShape::DoublePipe,
997 name: "doublepipe",
998 normal: Some('‖'),
999 ..GDEF
1000 },
1001 Glyph {
1002 shape: GlyphShape::Solidus,
1003 name: "solidus", // better known as "slash".
1004 normal: Some('/'),
1005 ..GDEF
1006 },
1007 Glyph {
1008 shape: GlyphShape::Times,
1009 name: "times",
1010 normal: Some('×'),
1011 ..GDEF
1012 },
1013 Glyph {
1014 shape: GlyphShape::Hash,
1015 name: "hash",
1016 normal: Some('#'),
1017 ..GDEF
1018 },
1019 Glyph {
1020 shape: GlyphShape::Arrow,
1021 // arr not arrow to follow Jurij's sub.py
1022 name: "arr",
1023 normal: Some('\u{2192}'), // →
1024 ..GDEF
1025 },
1026 Glyph {
1027 shape: GlyphShape::LessThan,
1028 name: "lessthan",
1029 normal: Some('<'),
1030 ..GDEF
1031 },
1032 Glyph {
1033 shape: GlyphShape::GreaterThan,
1034 name: "greaterthan",
1035 normal: Some('>'),
1036 ..GDEF
1037 },
1038 Glyph {
1039 shape: GlyphShape::Overbar,
1040 name: "overbar",
1041 // This character does not advance the carriage, so we use a
1042 // combining character for it.
1043 normal: Some('\u{0305}'), // U+0305, combining overline
1044 superscript: None,
1045 subscript: None,
1046 advance: false,
1047 },
1048 Glyph {
1049 shape: GlyphShape::Square,
1050 name: "square",
1051 // This character does not advance the carriage, so instead of
1052 // using a character like U+25A1 ('□'), we use a combining
1053 // character.
1054 normal: Some('\u{20DE}'), // U+20DE, combining enclosing square
1055 subscript: None,
1056 superscript: None,
1057 advance: false,
1058 },
1059 // 0o14 is "READ IN"
1060 //
1061 // 0o15 is "BEGIN"
1062 //
1063 // 0o16 is "NO"
1064 //
1065 // 0o17 is "YES"
1066 Glyph {
1067 shape: GlyphShape::n,
1068 name: "n",
1069 normal: Some('n'),
1070 superscript: Some('ⁿ'),
1071 subscript: Some('ₙ'), // U+2099
1072 ..GDEF
1073 },
1074 Glyph {
1075 shape: GlyphShape::SubsetOf,
1076 name: "subsetof",
1077 normal: Some('\u{2282}'), // Subset of, ⊂
1078 ..GDEF
1079 },
1080 Glyph {
1081 shape: GlyphShape::Or,
1082 name: "or",
1083 normal: Some('∨'),
1084 ..GDEF
1085 },
1086 Glyph {
1087 shape: GlyphShape::q,
1088 name: "q",
1089 normal: Some('q'),
1090 superscript: None,
1091 // U+107A5 is a subscript q, but this is not widely supported,
1092 // so we don't use it. Instead the user should use "@sub_q@".
1093 subscript: None,
1094 ..GDEF
1095 },
1096 Glyph {
1097 shape: GlyphShape::Gamma,
1098 name: "gamma",
1099 normal: Some('γ'), // U+03B3, Greek small letter gamma
1100 superscript: Some('ᵞ'),
1101 subscript: Some('ᵧ'),
1102 ..GDEF
1103 },
1104 Glyph {
1105 shape: GlyphShape::t,
1106 name: "t",
1107 normal: Some('t'),
1108 superscript: Some('ᵗ'), // U+1D57
1109 subscript: Some('ₜ'), // U+209C
1110 ..GDEF
1111 },
1112 Glyph {
1113 shape: GlyphShape::w,
1114 name: "w",
1115 normal: Some('w'),
1116 superscript: Some('ʷ'),
1117 subscript: None,
1118 ..GDEF
1119 },
1120 Glyph {
1121 shape: GlyphShape::x,
1122 name: "x",
1123 normal: Some('x'),
1124 superscript: Some('ˣ'),
1125 subscript: Some('ₓ'), // U+2093
1126 ..GDEF
1127 },
1128 Glyph {
1129 shape: GlyphShape::i,
1130 name: "i",
1131 normal: Some('i'),
1132 superscript: Some('ⁱ'),
1133 subscript: Some('ᵢ'),
1134 ..GDEF
1135 },
1136 Glyph {
1137 shape: GlyphShape::y,
1138 name: "y",
1139 normal: Some('y'),
1140 superscript: Some('ʸ'),
1141 subscript: None,
1142 ..GDEF
1143 },
1144 Glyph {
1145 shape: GlyphShape::z,
1146 name: "z",
1147 normal: Some('z'),
1148 subscript: None,
1149 superscript: Some('ᶻ'),
1150 ..GDEF
1151 },
1152 Glyph {
1153 shape: GlyphShape::Query, // A question mark.
1154 name: "?",
1155 normal: Some('?'),
1156 superscript: Some('ˀ'), // dot is missing but it's the best we can do.
1157 // U+FE56, "Small Question Mark" is not really a subscript
1158 // character, but let's try it out.
1159 subscript: Some('﹖'),
1160 ..GDEF
1161 },
1162 Glyph {
1163 shape: GlyphShape::Union,
1164 name: "union",
1165 normal: Some('∪'),
1166 superscript: None,
1167 subscript: None,
1168 ..GDEF
1169 },
1170 Glyph {
1171 shape: GlyphShape::Intersection,
1172 name: "intersection",
1173 normal: Some('\u{2229}'),
1174 subscript: None,
1175 superscript: None,
1176 ..GDEF
1177 },
1178 Glyph {
1179 shape: GlyphShape::j,
1180 name: "j",
1181 normal: Some('j'),
1182 superscript: Some('ʲ'), // U+02B2
1183 subscript: Some('ⱼ'), // U+2C7C
1184 ..GDEF
1185 },
1186 Glyph {
1187 shape: GlyphShape::k,
1188 name: "k",
1189 normal: Some('k'),
1190 superscript: Some('ᵏ'),
1191 subscript: Some('ₖ'), // U+2096
1192 ..GDEF
1193 },
1194 Glyph {
1195 shape: GlyphShape::Alpha,
1196 name: "alpha",
1197 normal: Some('α'), // U+03B1, alpha
1198 // this is actually a Latin superscript alpha, but it will normally look the same.
1199 superscript: Some('ᵅ'),
1200 subscript: None,
1201 ..GDEF
1202 },
1203 Glyph {
1204 shape: GlyphShape::Delta,
1205 name: "delta",
1206 normal: Some('Δ'), // U+0395, capital delta
1207 ..GDEF
1208 },
1209 Glyph {
1210 shape: GlyphShape::p,
1211 name: "p",
1212 normal: Some('p'),
1213 superscript: Some('ᵖ'),
1214 subscript: Some('ₚ'), // U+209A
1215 ..GDEF
1216 },
1217 Glyph {
1218 shape: GlyphShape::Epsilon,
1219 name: "eps",
1220 normal: Some('ε'), // U+03B5, Epsilon (not ∈, Element of)
1221 superscript: Some('ᵋ'), // U+1D4B
1222 subscript: None,
1223 ..GDEF
1224 },
1225 Glyph {
1226 shape: GlyphShape::h,
1227 name: "h",
1228 normal: Some('h'),
1229 superscript: Some('ʰ'),
1230 subscript: Some('ₕ'),
1231 ..GDEF
1232 },
1233 Glyph {
1234 shape: GlyphShape::SupersetOf,
1235 name: "sup", // name aligns with Jurij's sub.py
1236 normal: Some('⊃'), // U+2283, superset of
1237 superscript: None,
1238 subscript: None,
1239 ..GDEF
1240 },
1241 Glyph {
1242 shape: GlyphShape::Beta,
1243 name: "beta",
1244 normal: Some('β'), // U+03B2, Greek beta symbol
1245 superscript: Some('ᵝ'), // U+1D5D
1246 subscript: Some('ᵦ'), // U+1D66
1247 ..GDEF
1248 },
1249 Glyph {
1250 shape: GlyphShape::And,
1251 name: "and",
1252 normal: Some('∧'), // U+2227, Logical And
1253 superscript: None,
1254 subscript: None,
1255 ..GDEF
1256 },
1257 Glyph {
1258 shape: GlyphShape::Lambda,
1259 name: "lambda",
1260 normal: Some('λ'), // U+3BB, Greek letter lambda
1261 superscript: None,
1262 subscript: None,
1263 ..GDEF
1264 },
1265 Glyph {
1266 shape: GlyphShape::Tilde,
1267 name: "tilde",
1268 normal: Some('~'),
1269 ..GDEF
1270 },
1271 Glyph {
1272 shape: GlyphShape::LeftBrace,
1273 name: "leftbrace",
1274 normal: Some('{'),
1275 ..GDEF
1276 },
1277 Glyph {
1278 shape: GlyphShape::RightBrace,
1279 name: "rightbrace",
1280 normal: Some('}'),
1281 ..GDEF
1282 },
1283 Glyph {
1284 shape: GlyphShape::IdenticalTo,
1285 name: "hamb", // following Jurij's sub.py
1286 normal: Some('≡'), // U+2261, Identical to (Jurij used ☰, U+2630, Trigram For Heaven)
1287 ..GDEF
1288 },
1289 Glyph {
1290 shape: GlyphShape::Equals,
1291 name: "equals",
1292 normal: Some('='),
1293 subscript: Some('₌'),
1294 ..GDEF
1295 },
1296 Glyph {
1297 shape: GlyphShape::Apostrophe,
1298 name: "apostrophe",
1299 normal: Some('\''),
1300 ..GDEF
1301 },
1302 Glyph {
1303 shape: GlyphShape::Asterisk,
1304 name: "asterisk",
1305 normal: Some('*'),
1306 subscript: None,
1307 superscript: None,
1308 ..GDEF
1309 },
    // Code points 0o60 to 0o77 are non-graphic characters.
1311];
1312
1313/// Maps Unicode characters onto [`Glyph`] instances describing them.
1314#[derive(Debug, Clone, PartialEq, Eq)]
1315pub(crate) struct GlyphMapByChar {
1316 mapping: HashMap<char, Elevated<&'static Glyph>>,
1317}
1318
1319/// Read-only shared instance of [`GlyphMapByChar`].
1320static GLYPH_MAP_BY_CHAR: OnceLock<GlyphMapByChar> = OnceLock::new();
1321
1322impl Default for GlyphMapByChar {
1323 fn default() -> Self {
1324 let mut mapping = HashMap::new();
1325 for g in ALL_GLYPHS {
1326 for script in [Script::Sub, Script::Super, Script::Normal] {
1327 if g.normal == Some(' ') && script != Script::Normal {
1328 // Note that the space character has the same
1329 // representation in normal script, superscript
1330 // and subscript. We have a convention that space
1331 // is always deemed to be in normal script.
1332 continue;
1333 }
1334 if let Some(key) = g.get_char(script) {
1335 let value = elevate(script, g);
1336 if let Some(prev) = mapping.insert(key, value) {
1337 panic!("duplicate glyph mapping for character '{key}': {g:?} and {prev:?}");
1338 }
1339 }
1340 }
1341 }
1342 Self { mapping }
1343 }
1344}
1345
1346impl GlyphMapByChar {
1347 fn get(&self, ch: char) -> Option<Elevated<&'static Glyph>> {
1348 self.mapping.get(&ch).copied()
1349 }
1350}
1351
1352/// Return a reference to the shared instance of [`GlyphMapByChar`].
1353pub(crate) fn glyph_map() -> &'static GlyphMapByChar {
1354 GLYPH_MAP_BY_CHAR.get_or_init(GlyphMapByChar::default)
1355}
1356
/// Replace a Unicode input character with the character we treat as
/// its canonical equivalent, to prevent user confusion.
///
/// Two synonyms are accepted; every other character is returned
/// unchanged.
///
/// * "." (U+002E) becomes the centre dot ("·", U+00B7).  We use the
///   centre dot as a decimal point, but "." is likely to be a common
///   choice.
/// * ":" becomes "h", which sets the hold bit.  This lets us accept
///   source code from earlier papers which used the older
///   convention, such as H. Philip Peterson's "[Some Examples of
///   TX-2
///   Programming](http://www.bitsavers.org/pdf/mit/tx-2/6M-5780_Some_Examples_of_TX-2_Programming_Jul1958.pdf)"
///   (Lincoln Lab memo 6M-5780, 23 July 1958).
fn canonicalise_char(ch: char) -> char {
    // Each entry is (accepted input, canonical replacement).
    //
    // There is deliberately no entry converting U+A7F2 (ꟲ) to
    // U+1D9C, because the former is a majuscule (capital) letter and
    // the latter is a minuscule (lower-case) letter.
    const SYNONYMS: [(char, char); 2] = [
        ('.', '\u{00B7}'), // . -> ·
        // The TX-2 character set doesn't include ':', but some of
        // the older sources use ':' to signal that the hold bit
        // should be set in an instruction.  In the Users Handbook
        // (in 1961 at least) this function is performed by 'h'.
        (':', 'h'),
    ];

    SYNONYMS
        .iter()
        .find(|(accepted, _)| *accepted == ch)
        .map_or(ch, |&(_, canonical)| canonical)
}
1384
1385/// Convert a Unicode character into its `@...@` synonym.
1386pub(crate) fn name_from_glyph(mut ch: char) -> Option<&'static str> {
1387 // TODO: do we need both this and glyph_from_name?
1388 ch = canonicalise_char(ch);
1389 ALL_GLYPHS
1390 .iter()
1391 .find(|g| g.normal == Some(ch))
1392 .map(|g| g.name)
1393}
1394
1395/// Convert a Unicode string into [`Elevated<&'static Glyph>`].
1396///
1397/// Return `None` if the name of the glyph is not recognised.
1398pub(crate) fn glyph_from_name(name: &str) -> Option<Elevated<&'static Glyph>> {
1399 let (script, glyph_base_name) = if let Some(suffix) = name.strip_prefix("sub_") {
1400 (Script::Sub, suffix)
1401 } else if let Some(suffix) = name.strip_prefix("sup_") {
1402 (Script::Super, suffix)
1403 } else {
1404 (Script::Normal, name)
1405 };
1406 ALL_GLYPHS
1407 .iter()
1408 .find(|g| g.name == glyph_base_name)
1409 .map(|g| elevate(script, g))
1410}
1411
1412/// Return true if this character is allowed in a symex (symbol name).
1413///
1414/// Specified in Users Handbook section 6-2.3 item 6.
1415pub(crate) fn is_allowed_in_symex(g: GlyphShape) -> bool {
1416 match g {
1417 // Eeasier to understand if we don't re-order the match arms.
1418 #![allow(clippy::match_same_arms)]
1419 GlyphShape::Digit0 |
1420 GlyphShape::Digit1 |
1421 GlyphShape::Digit2 |
1422 GlyphShape::Digit3 |
1423 GlyphShape::Digit4 |
1424 GlyphShape::Digit5 |
1425 GlyphShape::Digit6 |
1426 GlyphShape::Digit7 |
1427 GlyphShape::Digit8 |
1428 GlyphShape::Digit9 |
1429 GlyphShape::A |
1430 GlyphShape::B |
1431 GlyphShape::C |
1432 GlyphShape::D |
1433 GlyphShape::E |
1434 GlyphShape::F |
1435 GlyphShape::G |
1436 GlyphShape::H |
1437 GlyphShape::I |
1438 GlyphShape::J |
1439 GlyphShape::K |
1440 GlyphShape::L |
1441 GlyphShape::M |
1442 GlyphShape::N |
1443 GlyphShape::O |
1444 GlyphShape::P |
1445 GlyphShape::Q |
1446 GlyphShape::R |
1447 GlyphShape::S |
1448 GlyphShape::T |
1449 GlyphShape::U |
1450 GlyphShape::V |
1451 GlyphShape::W |
1452 GlyphShape::X |
1453 GlyphShape::Y |
1454 GlyphShape::Z |
1455 GlyphShape::Alpha|
1456 GlyphShape::Beta |
1457 GlyphShape::Gamma |
1458 GlyphShape::Delta |
1459 GlyphShape::Epsilon |
1460 GlyphShape::Lambda |
1461 // Note: h is not allowed.
1462 GlyphShape::i |
1463 GlyphShape::j |
1464 GlyphShape::k |
1465 GlyphShape::n |
1466 GlyphShape::p |
1467 GlyphShape::q |
1468 GlyphShape::t |
1469 GlyphShape::w |
1470 GlyphShape::x |
1471 GlyphShape::y |
1472 GlyphShape::z |
1473 GlyphShape::Dot |
1474 GlyphShape::Apostrophe |
1475 GlyphShape::Underscore |
1476 GlyphShape::Overbar |
1477 GlyphShape::Square |
1478 GlyphShape::Circle => true,
1479 GlyphShape::Space => {
1480 // Space bar is allowed in a symex, per section 6-2.3.
1481 // But that doesn't necessarily mean that other space
1482 // characters are. However, we treat space and tab the
1483 // same, and don't include them in the symex syllable
1484 // token (instead we join symex syllables together in the
1485 // parser).
1486 true
1487 }
1488 _ => false,
1489 }
1490}