assembler/lexer.rs

//! Turn text input into a sequence of tokens.
//!
//! We perform two levels of lexing:
//!
//! - [`lower`]: handles comments, annotations, spaces and newlines.
//!   Generates a sequence of [`lower::Lexeme`] instances.
//! - [`Lexer`]: pulls text from the lower-level lexer and recognizes tokens.
//!   Generates a sequence of [`Token`] instances.
use std::{
    fmt::{Display, Write},
    ops::Range,
    str::CharIndices,
};

use base::{
    Unsigned36Bit,
    charset::{Script, subscript_char, superscript_char},
    error::StringConversionFailed,
};

use super::{
    glyph::{
        Elevated, Glyph, GlyphShape, Unrecognised, elevate, glyph_from_name, glyph_of_char,
        is_allowed_in_symex,
    },
    parser::helpers,
    state::NumeralMode,
};

#[cfg(test)]
mod input_file_tests;
mod lower;
#[cfg(test)]
mod tests;

type Span = Range<usize>;

pub(crate) const DOT_CHAR: char = '·';
pub(crate) const DOT_STR: &str = "·";

#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct NumericLiteral {
    /// The digits comprising the literal.
    digits: String,

    /// Indicates that the literal has a trailing dot.
    ///
    /// A trailing dot indicates that the literal has a non-default
    /// base (i.e. if the default is decimal, "." signals octal and if
    /// the default is octal, "." signals decimal).
    ///
    /// We don't know whether the base is decimal or octal within the
    /// lexer, because the state information that tracks the current
    /// default base belongs to the parser (as the parser has to
    /// interpret the meta commands which switch the default).
    has_trailing_dot: bool,
}

impl Display for NumericLiteral {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.digits.as_str())?;
        if self.has_trailing_dot {
            f.write_char(DOT_CHAR)?;
        }
        Ok(())
    }
}

impl NumericLiteral {
    pub(crate) fn make_num(
        &self,
        mode: NumeralMode,
    ) -> Result<Unsigned36Bit, StringConversionFailed> {
        helpers::make_num(self.digits.as_str(), self.has_trailing_dot, mode)
    }

    pub(crate) fn append_digits_of_literal(&mut self, other: &NumericLiteral) {
        assert!(!other.has_trailing_dot);
        self.digits.push_str(&other.digits);
    }

    pub(crate) fn has_trailing_dot(&self) -> bool {
        self.has_trailing_dot
    }

    pub(crate) fn take_digits(self) -> String {
        self.digits
    }
}
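
// A small illustrative check (added as a sketch): the Display impl above
// renders the digits followed by DOT_CHAR when `has_trailing_dot` is set.
// Interpreting that dot (octal vs. decimal) is left to `make_num`, which
// needs the parser's current `NumeralMode`.
#[test]
fn test_numeric_literal_display_trailing_dot() {
    let lit = NumericLiteral {
        digits: "12".to_string(),
        has_trailing_dot: true,
    };
    assert_eq!(lit.to_string(), "12·");
    assert!(lit.has_trailing_dot());
}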

/// Represents an item in the input that we didn't recognise or that
/// we don't support.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) enum ErrorTokenKind {
    /// Use of a tab character causes an error diagnostic; we don't
    /// support tabs in the same way as M4 does, since the `☛☛T` meta
    /// command is not implemented.
    Tab,
    UnrecognisedGlyph(Unrecognised),
}

impl Display for ErrorTokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ErrorTokenKind::Tab => {
                const LONG_MSG: &str = concat!(
                    "Please do not use the TAB character. ",
                    "The differences between the M4 assembler's interpretation of tab and its interpretation of the space ",
                    "character are complex, and these are not fully implemented.  If you want to ",
                    "prevent two adjacent symexes being joined together, please use parentheses ",
                    "or an explicit '+' operation instead.  That is, use (A)(B) or A+B instead of A<tab>B. ",
                    "If you intended to simply use TAB to produce some particular code layout, please ",
                    "use spaces instead.",
                );
                f.write_str(LONG_MSG)
            }
            ErrorTokenKind::UnrecognisedGlyph(e) => write!(f, "{e}"),
        }
    }
}

/// The parser consumes these tokens.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) enum Token {
    // In order for the parser to recover from tokenization errors, we
    // need to be able to emit an error token.
    Error(ErrorTokenKind),
    LeftBrace(Script),
    RightBrace(Script),
    Newline,
    Tab,

    /// The parser currently only handles parenthesised expressions in
    /// normal script.
    LeftParen(Script),

    /// The parser currently only handles parenthesised expressions in
    /// normal script.
    RightParen(Script),

    /// Accept either 'h' or ':' signalling that the hold bit (of the
    /// instruction word) should be set.  The documentation seems to
    /// use both, though perhaps ':' is the older usage.
    ///
    /// While h is indeed a letter, it is not one of the letters which
    /// can form part of a symex.  See the TX-2 Users Handbook,
    /// section 6-3.2, "RULES FOR SYMEX FORMATION".
    Hold,
    NotHold, // handled specially, there is no glyph for this.
    Arrow(Script),
    Hand(Script),
    Hash(Script),
    Equals(Script),

    /// Asterisk is used quite heavily (indicating deferred addressing)
    /// but while the TX-2 supports superscript and subscript
    /// asterisks, they don't seem to be used.  They are not valid as
    /// part of a symex (see User handbook, section 6-2.3) and are not
    /// macro terminators (6-4.5).  However, they are valid as part of
    /// a superposed character sequence making up a compound-character
    /// macro name.
    Asterisk(Script),

    Pipe(Script),
    DoublePipe(Script),
    ProperSuperset(Script),
    SubsetOf(Script),
    IdenticalTo(Script),
    Tilde(Script),
    LessThan(Script),
    GreaterThan(Script),
    Query(Script), // question mark, i.e. "?"
    Intersection(Script),
    Union(Script),

    /// Solidus is often called "slash" but people often confuse slash
    /// and backslash.  So we don't call it either.
    Solidus(Script),

    // @plus@ is actually not the correct glyph name, following sub.py.
    Plus(Script),
    Minus(Script),
    Times(Script),
    LogicalOr(Script),
    LogicalAnd(Script),

    // Any unary "-" is handled in the parser.
    Digits(Script, NumericLiteral),

    // Used as the index component of instructions like MKZ₄.₁₀
    BitPosition(Script, String, String),

    // TODO: missing from this are: overbar, square, circle.
    /// The rules concerning which characters can be part of a symex
    /// are given in the TX-2 Users Handbook, section 6-3.2, "RULES
    /// FOR SYMEX FORMATION".
    ///
    /// We do not accept dot as part of this token because it behaves
    /// differently in some circumstances (it is a macro terminator).
    /// However it is also part of a valid symex, and so we will need
    /// to parse it as such.
    SymexSyllable(Script, String),

    // If you change the representation of the dot in the token
    // definition, please also change DOT_CHAR.
    //
    // The Dot token requires care in handling.  It is valid at the
    // end of a numeric literal (where it signals use of the alternate
    // base).  It is also valid in a symex.  But it is also a macro
    // terminator.  To handle these complexities, we include the dot
    // in numeric literals (inside which spaces are not allowed).  But
    // we do not allow Dot inside Symex syllables - with the idea that
    // this will help us to correctly process them when used as macro
    // terminators.
    Dot(Script),
    Comma(Script),
}

impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut write_elevated = |script: &Script, s: &str| -> std::fmt::Result {
            let el = elevate(*script, s);
            write!(f, "{el}")
        };

        match self {
            Token::Error(msg) => write!(f, "(error: {msg})"),
            Token::LeftBrace(script) => write_elevated(script, "{"),
            Token::RightBrace(script) => write_elevated(script, "}"),
            Token::Newline => f.write_char('\n'),
            Token::Tab => f.write_char('\t'),
            Token::LeftParen(script) => write_elevated(script, "("),
            Token::RightParen(script) => write_elevated(script, ")"),
            Token::Hold => f.write_char('h'),
            Token::NotHold => f.write_char('ℏ'),
            Token::Arrow(script) => write_elevated(script, "->"),
            Token::Hand(script) => write_elevated(script, "☛"),
            Token::Asterisk(script) => write_elevated(script, "*"),
            Token::Dot(script) => write_elevated(script, DOT_STR),
            Token::Hash(script) => write_elevated(script, "#"),
            Token::Equals(script) => write_elevated(script, "="),
            Token::Pipe(script) => write_elevated(script, "|"),
            Token::DoublePipe(script) => write_elevated(script, "‖"), // U+2016
            Token::ProperSuperset(script) => write_elevated(script, "⊃"), // U+2283
            Token::SubsetOf(script) => write_elevated(script, "⊂"),   // U+2282
            Token::IdenticalTo(script) => write_elevated(script, "≡"),
            Token::Tilde(script) => write_elevated(script, "~"),
            Token::LessThan(script) => write_elevated(script, "<"),
            Token::GreaterThan(script) => write_elevated(script, ">"),
            Token::Query(script) => write_elevated(script, "?"),
            Token::Intersection(script) => write_elevated(script, "∩"),
            Token::Union(script) => write_elevated(script, "∪"),
            Token::Solidus(script) => write_elevated(script, "/"),
            Token::Plus(script) => write_elevated(script, "+"),
            Token::Minus(script) => write_elevated(script, "-"),
            Token::Times(script) => write_elevated(script, "×"),
            Token::LogicalOr(script) => write_elevated(script, "∨"),
            Token::LogicalAnd(script) => write_elevated(script, "∧"),
            Token::Digits(script, numeric_literal) => {
                write!(f, "{}", elevate(*script, numeric_literal.to_string()))
            }
            Token::BitPosition(script, quarter, bit) => {
                let q_string = elevate(*script, quarter.to_string());
                let bit_string = elevate(*script, bit.to_string());
                let dotname = match script {
                    Script::Normal => "@dot@",
                    Script::Sub => "@sub_dot@",
                    Script::Super => "@sup_dot@",
                };
                write!(f, "{q_string}{dotname}{bit_string}")
            }
            Token::SymexSyllable(script, name) => {
                #[allow(clippy::unnecessary_wraps)]
                fn nochange(ch: char) -> Result<char, ()> {
                    Ok(ch)
                }
                fn convert_to_sup(ch: char) -> Result<char, ()> {
                    superscript_char(ch).map_err(|_| ())
                }
                fn convert_to_sub(ch: char) -> Result<char, ()> {
                    subscript_char(ch).map_err(|_| ())
                }
                type Transformer = fn(char) -> Result<char, ()>;
                let (prefix, transform): (&'static str, Transformer) = match script {
                    Script::Super => ("super_", convert_to_sup),
                    Script::Normal => ("", nochange),
                    Script::Sub => ("sub_", convert_to_sub),
                };
                for ch in name.chars() {
                    match transform(ch) {
                        Ok(sup_ch) => f.write_char(sup_ch),
                        Err(()) => match ch {
                            'α' => write!(f, "@{prefix}alpha@"),
                            'β' => write!(f, "@{prefix}beta@"),
                            'γ' => write!(f, "@{prefix}gamma@"),
                            'Δ' => write!(f, "@{prefix}delta@"),
                            'ε' => write!(f, "@{prefix}eps@"),
                            'λ' => write!(f, "@{prefix}lambda@"),
                            _ => write!(f, "@{prefix}{ch}@"),
                        },
                    }?;
                }
                Ok(())
            }
            Token::Comma(script) => write_elevated(script, ","),
        }
    }
}
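
// Illustrative sketch of the Display impl above, restricted to variants
// that are written directly with write_char (so we don't need to assume
// anything about how Elevated values render).
#[test]
fn test_token_display_simple_variants() {
    assert_eq!(Token::Hold.to_string(), "h");
    assert_eq!(Token::NotHold.to_string(), "ℏ");
    assert_eq!(Token::Newline.to_string(), "\n");
}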

/// Convert a string into a sequence of [`Elevated<&Glyph>`].
#[derive(Debug, Clone)]
struct GlyphRecognizer<'a> {
    it: CharIndices<'a>,
    pos: usize,
    glyph_start: usize,
}

impl<'a> GlyphRecognizer<'a> {
    fn new(input: &'a str) -> GlyphRecognizer<'a> {
        Self {
            it: input.char_indices(),
            pos: 0,
            glyph_start: 0,
        }
    }

    fn get_next_char(&mut self) -> Option<char> {
        match self.it.next() {
            None => None,
            Some((i, ch)) => {
                self.pos = i;
                Some(ch)
            }
        }
    }

    fn span(&self) -> Span {
        self.glyph_start..(self.it.offset())
    }

    fn next_named_glyph(&mut self) -> Option<Result<Elevated<&'static Glyph>, Unrecognised>> {
        let mut name: String = String::with_capacity(10);
        let mut got_anything = false;
        while let Some(ch) = self.get_next_char() {
            got_anything = true;
            if ch == '@' {
                break;
            }
            name.push(ch);
        }

        // If the input was @@ (that is, the glyph name is
        // zero-length), name is empty but got_anything is
        // (correctly) true.
        if got_anything {
            Some(match glyph_from_name(name.as_str()) {
                Some(g) => Ok(g),
                None => Err(Unrecognised::UnrecognisedGlyph(name)),
            })
        } else {
            None
        }
    }
}

impl Iterator for GlyphRecognizer<'_> {
    type Item = Result<Elevated<&'static Glyph>, Unrecognised>;

    fn next(&mut self) -> Option<Self::Item> {
        let ch = self.get_next_char()?;
        self.glyph_start = self.pos;
        match ch {
            '@' => match self.next_named_glyph() {
                None => {
                    // There actually was input, but it was only a
                    // single '@'.  That is not in the Lincoln
                    // Writer character set.
                    Some(Err(Unrecognised::InvalidChar('@')))
                }
                something => something,
            },
            ch => Some(glyph_of_char(ch)),
        }
    }
}

#[test]
fn test_glyph_recognizer_next() {
    let mut gr = GlyphRecognizer::new("W");
    match gr.next() {
        Some(Ok(elev)) => {
            assert_eq!(elev.script(), Script::Normal);
            assert_eq!(elev.get().name, "W");
        }
        bad => {
            panic!("glyph should not have been recognised as {bad:?}");
        }
    }
    assert_eq!(gr.next(), None);
}
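
// A sketch of the "@@" corner case documented in next_named_glyph: the
// glyph name is zero-length, so (assuming the glyph module has no glyph
// registered under the empty name) we expect the empty name to come back
// as an UnrecognisedGlyph error rather than as end-of-input.
#[test]
fn test_glyph_recognizer_empty_glyph_name() {
    let mut gr = GlyphRecognizer::new("@@");
    assert_eq!(
        gr.next(),
        Some(Err(Unrecognised::UnrecognisedGlyph(String::new())))
    );
    assert_eq!(gr.next(), None);
}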

#[cfg(test)]
fn assert_glyph(
    got: Elevated<&'static Glyph>,
    expected_shape: GlyphShape,
    expected_script: Script,
) {
    assert_eq!(got.script(), expected_script, "wrong script for {got:?}");
    assert_eq!(got.get().shape(), expected_shape, "wrong shape for {got:?}");
}

#[test]
fn test_glyph_scanning() {
    let mut scanner = GlyphRecognizer::new("hs@sub_eps@@hamb@@sup_add@@nosuch@ ");
    // h is in the Lincoln Writer character set.
    assert_glyph(
        scanner.next().expect("input").expect("in character set"),
        GlyphShape::h,
        Script::Normal,
    );
    // s is not in the Lincoln Writer character set.
    assert_eq!(scanner.next(), Some(Err(Unrecognised::InvalidChar('s'))));
    assert_glyph(
        scanner.next().expect("input").expect("in character set"),
        GlyphShape::Epsilon,
        Script::Sub,
    );
    assert_glyph(
        scanner.next().expect("input").expect("in character set"),
        GlyphShape::IdenticalTo,
        Script::Normal,
    );
    assert_glyph(
        scanner.next().expect("input").expect("in character set"),
        GlyphShape::Add,
        Script::Super,
    );
    assert_eq!(
        scanner.next(),
        Some(Err(Unrecognised::UnrecognisedGlyph("nosuch".to_string())))
    );
    assert_glyph(
        scanner.next().expect("input").expect("in character set"),
        GlyphShape::Space,
        Script::Normal,
    );
    assert_eq!(scanner.next(), None);
    // Verify that detection of end-of-input is sticky.
    assert_eq!(scanner.next(), None);
}

fn tokenise_single_glyph(g: Elevated<&'static Glyph>) -> Option<Token> {
    let script: Script = g.script();

    let make_num = |ch: char| {
        let literal = NumericLiteral {
            digits: {
                let mut s = String::with_capacity(12);
                s.push(ch);
                s
            },
            has_trailing_dot: false,
        };
        Token::Digits(script, literal)
    };
    let make_symex = || -> Option<Token> {
        // The symex token always gives the characters in normal
        // script.  The superscript/subscript information is
        // carried in the token's script field.
        let name: String = g.get().get_char(Script::Normal).iter().collect();
        // We do not use name.len() here because the number of
        // bytes in the name is not relevant, only the number of
        // Unicode code points.
        match name.chars().count() {
            0 => {
                panic!(
                    "incoming token '{g:?}' was assigned as part of a symex syllable, but we don't have a character for it in script {script:?}"
                );
            }
            1 => (),
            n => {
                panic!(
                    "incoming token '{g:?}' was assigned as part of a symex syllable, but the resulting initial token body unexpectedly has more than one character (specifically, {n}): {name:?}"
                );
            }
        }
        Some(Token::SymexSyllable(script, name))
    };

    // In the grammar described in section 6 of the Users Handbook,
    // space and tab are not handled in quite the same way.  Space is
    // allowed in symexes, but tab is not (tab terminates a symex).
    #[allow(clippy::match_same_arms)] // easier to read in existing order
    let output: Option<Token> = match g.get().shape() {
        GlyphShape::Space | GlyphShape::Tab => None,
        GlyphShape::Digit0 => Some(make_num('0')),
        GlyphShape::Digit1 => Some(make_num('1')),
        GlyphShape::Digit2 => Some(make_num('2')),
        GlyphShape::Digit3 => Some(make_num('3')),
        GlyphShape::Digit4 => Some(make_num('4')),
        GlyphShape::Digit5 => Some(make_num('5')),
        GlyphShape::Digit6 => Some(make_num('6')),
        GlyphShape::Digit7 => Some(make_num('7')),
        GlyphShape::Digit8 => Some(make_num('8')),
        GlyphShape::Digit9 => Some(make_num('9')),
        GlyphShape::Underscore
        | GlyphShape::Circle
        | GlyphShape::A
        | GlyphShape::B
        | GlyphShape::C
        | GlyphShape::D
        | GlyphShape::E
        | GlyphShape::F
        | GlyphShape::G
        | GlyphShape::H
        | GlyphShape::I
        | GlyphShape::J
        | GlyphShape::K
        | GlyphShape::L
        | GlyphShape::M
        | GlyphShape::N
        | GlyphShape::O
        | GlyphShape::P
        | GlyphShape::Q
        | GlyphShape::R
        | GlyphShape::S
        | GlyphShape::T
        | GlyphShape::U
        | GlyphShape::V
        | GlyphShape::W
        | GlyphShape::X
        | GlyphShape::Y
        | GlyphShape::Z => make_symex(),
        GlyphShape::LeftParen => Some(Token::LeftParen(script)),
        GlyphShape::RightParen => Some(Token::RightParen(script)),
        GlyphShape::Add => Some(Token::Plus(script)),
        GlyphShape::Minus => Some(Token::Minus(script)),
        GlyphShape::Comma => Some(Token::Comma(script)),
        GlyphShape::Dot => Some(Token::Dot(script)),
        GlyphShape::Backspace => unimplemented!("compound characters are not yet supported"),
        GlyphShape::Hand => Some(Token::Hand(script)),
        GlyphShape::Sigma => {
            todo!("Sigma (which is a symex terminator) does not yet have a token")
        }
        GlyphShape::Pipe => Some(Token::Pipe(script)),
        GlyphShape::DoublePipe => Some(Token::DoublePipe(script)),
        GlyphShape::Solidus => Some(Token::Solidus(script)),
        GlyphShape::Times => Some(Token::Times(script)),
        GlyphShape::Hash => Some(Token::Hash(script)),
        GlyphShape::Arrow => Some(Token::Arrow(script)),
        GlyphShape::LessThan => Some(Token::LessThan(script)),
        GlyphShape::GreaterThan => Some(Token::GreaterThan(script)),
        GlyphShape::Overbar | GlyphShape::Square | GlyphShape::n => make_symex(),
        GlyphShape::SubsetOf => Some(Token::SubsetOf(script)),
        GlyphShape::Or => Some(Token::LogicalOr(script)),
        GlyphShape::q
        | GlyphShape::Gamma
        | GlyphShape::t
        | GlyphShape::w
        | GlyphShape::x
        | GlyphShape::i
        | GlyphShape::y
        | GlyphShape::z => make_symex(),
        GlyphShape::Query => Some(Token::Query(script)),
        GlyphShape::Union => Some(Token::Union(script)),
        GlyphShape::Intersection => Some(Token::Intersection(script)),
        GlyphShape::j | GlyphShape::k => make_symex(),
        GlyphShape::Alpha => make_symex(),
        GlyphShape::Delta => make_symex(),
        GlyphShape::p => make_symex(),
        GlyphShape::Epsilon => make_symex(),
        GlyphShape::h => Some(match script {
            // h cannot be part of a symex (see Users Handbook,
            // section 6-2.3).
            Script::Super | Script::Sub => unimplemented!(),
            Script::Normal => Token::Hold,
        }),
        // Todo: Token::NotHold.
        GlyphShape::SupersetOf => Some(Token::ProperSuperset(script)),
        GlyphShape::Beta => make_symex(),
        GlyphShape::And => Some(Token::LogicalAnd(script)),
        GlyphShape::Lambda => make_symex(),
        GlyphShape::Tilde => Some(Token::Tilde(script)),
        GlyphShape::LeftBrace => Some(Token::LeftBrace(script)),
        GlyphShape::RightBrace => Some(Token::RightBrace(script)),
        GlyphShape::IdenticalTo => Some(Token::IdenticalTo(script)),
        GlyphShape::Equals => Some(Token::Equals(script)),
        GlyphShape::Apostrophe => make_symex(),
        GlyphShape::Asterisk => Some(Token::Asterisk(script)),
    };
    if let Some(t) = output.as_ref() {
        if matches!(t, Token::SymexSyllable(_, _)) {
            assert!(
                is_allowed_in_symex(g.get().shape),
                "attempted to make a symex with disallowed glyph shape {g:?}"
            );
        } else if matches!(t, Token::Digits(_, _) | Token::Dot(_)) {
            // Digits and dots (mostly called "PERIOD" in the Users
            // Handbook) are allowed in numeric literals but also
            // allowed in symexes.  We scan them as numeric
            // literals and then join them into symexes when we
            // discover what their neighbours are.
            assert!(
                is_allowed_in_symex(g.get().shape),
                "all glyphs allowed in numeric literals are also allowed in symexes, but this went wrong for {g:?}"
            );
        } else if g.get().shape == GlyphShape::Space {
            // Permitted in a symex but this is implemented at the parser level.
        } else {
            assert!(
                !is_allowed_in_symex(g.get().shape),
                "glyph shape {g:?} is allowed in a symex but the scanner didn't recognise it that way"
            );
        }
    }
    output
}
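
// A small sketch of tokenise_single_glyph in isolation.  It assumes (as
// the scanning tests above already do) that glyph_of_char recognises 'W'
// and '4' as normal-script glyphs in the Lincoln Writer character set.
#[test]
fn test_tokenise_single_glyph_examples() {
    let w = glyph_of_char('W').expect("'W' should be in the character set");
    assert_eq!(
        tokenise_single_glyph(w),
        Some(Token::SymexSyllable(Script::Normal, "W".to_string()))
    );

    let four = glyph_of_char('4').expect("'4' should be in the character set");
    assert_eq!(
        tokenise_single_glyph(four),
        Some(Token::Digits(
            Script::Normal,
            NumericLiteral {
                digits: "4".to_string(),
                has_trailing_dot: false,
            }
        ))
    );
}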

#[derive(Debug, PartialEq, Eq)]
enum TokenMergeResult {
    Merged(Token, Span),
    Failed {
        current: Token,
        current_span: Span,
        incoming: Token,
        incoming_span: Span,
    },
}

fn merge_tokens(current: (Token, Span), incoming: (Token, Span)) -> TokenMergeResult {
    // We never merge error tokens with anything else, so eliminate
    // those cases first.
    let ((current, current_span), (incoming, incoming_span)) = (current, incoming);
    if matches!(
        (&current, &incoming),
        (&Token::Error(_), _) | (_, &Token::Error(_))
    ) {
        return TokenMergeResult::Failed {
            current,
            current_span,
            incoming,
            incoming_span,
        };
    }

    let merged_span = current_span.start..incoming_span.end;
    match current {
        Token::Minus(incoming_script)
            if incoming == Token::GreaterThan(incoming_script)
                && incoming_script == Script::Normal =>
        {
            TokenMergeResult::Merged(Token::Arrow(Script::Normal), merged_span)
        }
        Token::SymexSyllable(existing_script, mut existing_name) => match incoming {
            Token::Hold if existing_script == Script::Normal => {
                // Overbar followed by h means not-hold, and we handle this case specially.
                if existing_name == "\u{0305}" {
                    TokenMergeResult::Merged(Token::NotHold, merged_span)
                } else {
                    TokenMergeResult::Failed {
                        current: Token::SymexSyllable(existing_script, existing_name),
                        current_span,
                        incoming: Token::Hold,
                        incoming_span,
                    }
                }
            }
            Token::SymexSyllable(incoming_script, incoming_name)
                if existing_script == incoming_script =>
            {
                existing_name.push_str(&incoming_name);
                TokenMergeResult::Merged(
                    Token::SymexSyllable(existing_script, existing_name),
                    merged_span,
                )
            }
            Token::Digits(incoming_script, literal) if existing_script == incoming_script => {
                existing_name.push_str(&literal.digits);
                if literal.has_trailing_dot {
                    existing_name.push(DOT_CHAR);
                }
                TokenMergeResult::Merged(
                    Token::SymexSyllable(existing_script, existing_name),
                    merged_span,
                )
            }
            other => TokenMergeResult::Failed {
                current: Token::SymexSyllable(existing_script, existing_name),
                current_span,
                incoming: other,
                incoming_span,
            },
        },
        Token::Digits(existing_script, mut existing_literal) => {
            if existing_literal.has_trailing_dot {
                // The left-hand literal has a dot.  So the valid
                // cases are where we're part-way through a symex, or
                // a bit position specification.
                match incoming {
                    Token::Digits(
                        Script::Sub,
                        NumericLiteral {
                            digits: incoming_digit,
                            has_trailing_dot: false,
                        },
                    ) if existing_script == Script::Sub => TokenMergeResult::Merged(
                        Token::BitPosition(
                            existing_script,
                            existing_literal.digits,
                            incoming_digit,
                        ),
                        merged_span,
                    ),
                    // Not valid for RHS to be Dot, as we already have one.
                    other => TokenMergeResult::Failed {
                        current: Token::Digits(existing_script, existing_literal),
                        current_span,
                        incoming: other,
                        incoming_span,
                    },
                }
            } else {
                match incoming {
                    Token::Digits(incoming_script, incoming_name)
                        if existing_script == incoming_script =>
                    {
                        existing_literal.append_digits_of_literal(&incoming_name);
                        TokenMergeResult::Merged(
                            Token::Digits(existing_script, existing_literal),
                            merged_span,
                        )
                    }
                    Token::Dot(right_script)
                        if existing_script == right_script
                            && !existing_literal.has_trailing_dot =>
                    {
                        existing_literal.has_trailing_dot = true;
                        TokenMergeResult::Merged(
                            Token::Digits(existing_script, existing_literal),
                            merged_span,
                        )
                    }
                    Token::SymexSyllable(incoming_script, sym)
                        if existing_script == incoming_script =>
                    {
                        let mut existing_name: String = existing_literal.digits;
                        existing_name.push_str(&sym);
                        TokenMergeResult::Merged(
                            Token::SymexSyllable(existing_script, existing_name),
                            merged_span,
                        )
                    }
                    other => TokenMergeResult::Failed {
                        current: Token::Digits(existing_script, existing_literal),
                        current_span,
                        incoming: other,
                        incoming_span,
                    },
                }
            }
        }
        Token::BitPosition(existing_script, existing_quarter, mut existing_bit) => match incoming {
            Token::Digits(
                incoming_script,
                NumericLiteral {
                    digits: incoming_digit,
                    has_trailing_dot: false,
                },
            ) if existing_script == incoming_script => {
                existing_bit.push_str(incoming_digit.as_str());
                TokenMergeResult::Merged(
                    Token::BitPosition(existing_script, existing_quarter, existing_bit),
                    merged_span,
                )
            }
            Token::Dot(incoming_script) if existing_script == incoming_script => {
                let name = format!("{existing_quarter}\u{00B7}{existing_bit}\u{00B7}");
                TokenMergeResult::Merged(Token::SymexSyllable(Script::Sub, name), merged_span)
            }
            Token::SymexSyllable(incoming_script, symbol) if existing_script == incoming_script => {
                let name = format!("{existing_quarter}\u{00B7}{existing_bit}{symbol}");
                TokenMergeResult::Merged(Token::SymexSyllable(Script::Sub, name), merged_span)
            }
            other => TokenMergeResult::Failed {
                current: Token::BitPosition(existing_script, existing_quarter, existing_bit),
                current_span,
                incoming: other,
                incoming_span,
            },
        },
        existing => TokenMergeResult::Failed {
            current: existing,
            current_span,
            incoming,
            incoming_span,
        },
    }
}
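
// A few illustrative checks of merge_tokens, exercising cases documented
// in the comments above: "-" followed by ">" becomes an arrow, a digit
// string followed by a dot becomes a literal with a trailing dot, and a
// subscript literal with a trailing dot followed by more subscript digits
// becomes a bit-position designator (as in MKZ₄.₁₀).
#[test]
fn test_merge_tokens_examples() {
    // Minus then GreaterThan (both normal script) merge into Arrow.
    assert_eq!(
        merge_tokens(
            (Token::Minus(Script::Normal), 0..1),
            (Token::GreaterThan(Script::Normal), 1..2)
        ),
        TokenMergeResult::Merged(Token::Arrow(Script::Normal), 0..2)
    );

    // Digits followed by Dot in the same script pick up the trailing dot.
    let digits = |s: &str, dot: bool| NumericLiteral {
        digits: s.to_string(),
        has_trailing_dot: dot,
    };
    assert_eq!(
        merge_tokens(
            (Token::Digits(Script::Normal, digits("12", false)), 0..2),
            (Token::Dot(Script::Normal), 2..4)
        ),
        TokenMergeResult::Merged(Token::Digits(Script::Normal, digits("12", true)), 0..4)
    );

    // A subscript literal with a trailing dot followed by subscript
    // digits becomes a BitPosition.
    assert_eq!(
        merge_tokens(
            (Token::Digits(Script::Sub, digits("4", true)), 0..1),
            (Token::Digits(Script::Sub, digits("10", false)), 1..3)
        ),
        TokenMergeResult::Merged(
            Token::BitPosition(Script::Sub, "4".to_string(), "10".to_string()),
            0..3
        )
    );
}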

/// Tokenize a small part of the input, unifying the Unicode and
/// `@...@` representations.
#[derive(Debug, Clone)]
struct GlyphTokenizer<'a> {
    current: Option<(Token, Span)>,
    inner: GlyphRecognizer<'a>,
}

impl<'a> GlyphTokenizer<'a> {
    fn new(input: &'a str) -> GlyphTokenizer<'a> {
        GlyphTokenizer {
            current: None,
            inner: GlyphRecognizer::new(input),
        }
    }

    fn get_next_spanned_token(&mut self) -> Option<(Token, Span)> {
        loop {
            let maybe_spanned_new_token: Option<(Token, Span)> = match self.inner.next() {
                None => None,
                Some(Err(Unrecognised::InvalidChar('ℏ'))) => {
                    // ℏ is Unicode code point U+210F.  There
                    // is no glyph matching ℏ, because on the
                    // TX-2 this was produced with an overbar
                    // (which does not advance the carriage)
                    // followed by a regular h.  We accept it
                    // as a special case.
                    //
                    // Because there is no Glyph for this, we
                    // do not accept @...@ (e.g. @hbar@) for
                    // this.
                    return Some((Token::NotHold, self.inner.span()));
                }
                Some(Err(e)) => {
                    let error_token = Token::Error(ErrorTokenKind::UnrecognisedGlyph(e));
                    Some((error_token, self.inner.span()))
                }
                Some(Ok(g)) => {
                    if matches!(g.get().shape(), GlyphShape::Space | GlyphShape::Tab) {
                        match self.current.take() {
                            Some(r) => {
                                return Some(r);
                            }
                            None => {
                                continue;
                            }
                        }
                    }
                    let tok_start = self.inner.span().start;
                    match tokenise_single_glyph(g) {
                        Some(token) => {
                            let span = tok_start..self.inner.span().end;
                            Some((token, span))
                        }
                        None => {
                            unimplemented!("unable to convert glyph '{g:?}' to a token")
                        }
                    }
                }
            };
            if let Some((newtoken, newtoken_span)) = maybe_spanned_new_token {
                if let Some((current, current_span)) = self.current.take() {
                    match merge_tokens((current, current_span), (newtoken, newtoken_span)) {
                        TokenMergeResult::Merged(merged, merged_span) => {
                            self.current = Some((merged, merged_span));
                        }
                        TokenMergeResult::Failed {
                            current,
                            current_span,
                            incoming: newtoken,
                            incoming_span: newtoken_span,
                        } => {
                            let result = (current, current_span);
                            self.current = Some((newtoken, newtoken_span));
                            return Some(result);
                        }
                    }
                } else {
                    // There is a new token but no existing token.
                    self.current = Some((newtoken, newtoken_span));
                }
            } else {
                // There is no new token.
                return self.current.take();
            }
        }
    }
}

#[test]
fn test_glyph_tokenizer_simple_multi_token() {
    let mut lex = GlyphTokenizer::new("hx");
    assert_eq!(lex.get_next_spanned_token(), Some((Token::Hold, 0..1)));
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Normal, "x".to_string()), 1..2))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}

#[test]
fn test_glyph_tokenizer_glyph_names() {
    let mut lex = GlyphTokenizer::new("@sup_eps@");
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Super, "ε".to_string()), 0..9))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}

#[test]
fn test_glyph_tokenizer_multi_glyph_token() {
    // These glyphs are a single token because they are both valid
    // in a symex and are both in superscript.
    let input = "@sup_eps@ᵂ";
    let mut lex = GlyphTokenizer::new(input);
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Super, "εW".to_string()), 0..12))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}

#[test]
fn test_glyph_tokenizer_script_change_breaks_tokens() {
    // Verify that a change from superscript to normal script
    // causes two immediately adjoining letters to be considered
    // separate tokens.
    let mut lex = GlyphTokenizer::new("@sup_eps@W");
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Super, "ε".to_string()), 0..9))
    );
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Normal, "W".to_string()), 9..10))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}

#[test]
fn test_glyph_tokenizer_space_breaks_tokens() {
    // Verify that a space breaks tokens.  For symexes, the parser
    // will join the syllables together into a single name, but
    // they are scanned as separate tokens.
    let mut lex = GlyphTokenizer::new("W Q");
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Normal, "W".to_string()), 0..1))
    );
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::SymexSyllable(Script::Normal, "Q".to_string()), 2..3))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}

#[test]
fn test_glyph_tokenizer_question_mark() {
    let mut lex = GlyphTokenizer::new("?");
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((Token::Query(Script::Normal), 0..1))
    );
}
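
// A sketch of how a numeric literal with a trailing dot comes out of the
// glyph tokenizer, assuming the glyph module recognises '·' (DOT_CHAR,
// two bytes in UTF-8) as the dot glyph: the digits and the dot merge into
// a single Digits token with has_trailing_dot set.
#[test]
fn test_glyph_tokenizer_trailing_dot_literal() {
    let mut lex = GlyphTokenizer::new("12·");
    assert_eq!(
        lex.get_next_spanned_token(),
        Some((
            Token::Digits(
                Script::Normal,
                NumericLiteral {
                    digits: "12".to_string(),
                    has_trailing_dot: true,
                }
            ),
            0..4
        ))
    );
    assert_eq!(lex.get_next_spanned_token(), None);
}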

/// Tokenize the body of a source code file.
///
/// This is the lexer interface intended for use by the parser.
///
/// We use [`lower::LowerLexer`] to handle comments, line continuation
/// and nesting of RC-blocks, and we use [`GlyphTokenizer`] to unify
/// the several equivalent representations of some symbols (e.g. `₂`
/// and `@sub_2@`).
#[derive(Debug, Clone)]
pub(crate) struct Lexer<'a> {
    lower: lower::LowerLexer<'a>,
    upper: Option<GlyphTokenizer<'a>>,
    upper_span: Option<Span>,
}

impl<'a> Lexer<'a> {
    pub(crate) fn new(input: &'a str) -> Lexer<'a> {
        Lexer {
            lower: lower::LowerLexer::new(input),
            upper: None,
            upper_span: None,
        }
    }

    fn adjust_span(&self, span: Range<usize>) -> Range<usize> {
        match self.upper_span.as_ref() {
            None => span,
            Some(upper_span) => {
                let offset = span.start;
                (upper_span.start + offset)..(upper_span.end + offset)
            }
        }
    }

    pub(crate) fn span(&self) -> Range<usize> {
        self.adjust_span(self.lower.span())
    }

    pub(crate) fn spanned(&self) -> SpannedIter<'a> {
        let lexer: Lexer<'a> = self.clone();
        SpannedIter::new(lexer)
    }

    fn get_next(&mut self) -> Option<Token> {
        use lower::Lexeme;
        // If we have more input from the upper lexer, use it.
        if let Some(upper_lexer) = self.upper.as_mut()
            && let Some((r, span)) = upper_lexer.get_next_spanned_token()
        {
            self.upper_span = Some(span);
            return Some(r);
        }

        // Fetch more text from the lower lexer.
        self.upper = None;
        self.upper_span = None;
        match self.lower.next() {
            Lexeme::EndOfInput => None,
            Lexeme::Tok(tok) => Some(tok),
            Lexeme::Text(text) => {
                let upper = GlyphTokenizer::new(text);
                self.upper = Some(upper);
                match self
                    .upper
                    .as_mut()
                    .expect("the option cannot be empty, we just filled it")
                    .get_next_spanned_token()
                {
                    Some((r, span)) => {
                        self.upper_span = Some(span);
                        Some(r)
                    }
                    None => None,
                }
            }
            Lexeme::Err(e) => Some(Token::Error(ErrorTokenKind::UnrecognisedGlyph(e))),
        }
    }
}

impl Iterator for Lexer<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        self.get_next()
    }
}

#[derive(Debug, Clone)]
pub(crate) struct SpannedIter<'a> {
    lexer: Lexer<'a>,
}

impl<'a> SpannedIter<'a> {
    pub(crate) fn new(lexer: Lexer<'a>) -> SpannedIter<'a> {
        SpannedIter { lexer }
    }
}

impl Iterator for SpannedIter<'_> {
    type Item = (Token, Span);

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.lexer.next();
        token.map(|t| (t, self.lexer.span()))
    }
}
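
// An end-to-end sketch of the Lexer interface described above.  It
// assumes lower::LowerLexer passes ordinary source text (with no
// comments, annotations or continuations) through as Lexeme::Text, so
// that "h W" tokenizes to a Hold token followed by a symex syllable.
#[test]
fn test_lexer_simple_input() {
    let mut lex = Lexer::new("h W");
    assert_eq!(lex.next(), Some(Token::Hold));
    assert_eq!(
        lex.next(),
        Some(Token::SymexSyllable(Script::Normal, "W".to_string()))
    );
}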