assembler/lexer/
lower.rs

1//! A "partial" lexer which determines whether we're inside an
2//! RC-block or a comment.
3use std::ops::Range;
4
5use logos::Logos;
6
7use base::charset::Script;
8
9use super::super::glyph::Unrecognised;
10
11/// `InnerToken` is the result of a "partial" lexer which only
12/// identifies enough tokens to determine whether we're inside an
13/// RC-block or a comment.  We do this in order to handle
14/// differing interpretations of '}' within a comment; if the
15/// comment is inside an RC-block, then '}' terminates the
16/// RC-block.  But if the comment is not inside an RC-block, then
17/// '}' is not special and forms part of the comment.
18#[derive(Debug, Logos, PartialEq, Clone, Copy)]
19pub(super) enum InnerToken {
20    #[regex("[*][*][^}\n]*")]
21    CommentStart,
22
23    #[token("\n")]
24    Newline,
25
26    /// We use annotations in source files where we want a comment which isn't part of
27    // the assembler syntax.  The interpretations of annotations may
28    // change in the future (but comments will not) so you should
29    // generally prefer to use a comment.
30    //
31    // Should be higher-priority than Text.
32    #[regex(r"\[[^]]*\]", priority = 5)]
33    Annotation,
34
35    // There is no @..@ syntax for left-brace, but if there were,
36    // we would need to match it here also.
37    #[token("{")]
38    LeftBrace,
39
40    #[token("}")]
41    RightBrace,
42
43    // The Regex crate allows escaping in character classes, which is
44    // something we need to do to use '[' in a (negated in this case)
45    // character class.
46    #[regex("[^ \\[\t{}\n]+", priority = 3)]
47    Text,
48
49    // We distinguish tab from spaces because they are handled
50    // differently.  Space is allowed inside symexes while tab is not.
51    #[token("\t")]
52    Tab,
53
54    #[regex("[ ]+")]
55    Spaces,
56}
57
/// Keep track of whether we are in an RC-block, in a comment, or
/// both.
///
/// RC-blocks nest, so we have to use a count in order to determine
/// whether a '}' means we're no longer in an RC-block.
#[derive(Debug, Default, Clone, Copy)]
struct State {
    /// Are we in a comment?
    in_comment: bool,

    /// Count of open braces.
    lbrace_count: usize,
}

impl State {
    /// Verifies that the state is what we expect at the beginning of
    /// a line: not inside a comment and with no open braces.
    ///
    /// # Panics
    ///
    /// Panics if `in_comment` is set or `lbrace_count` is non-zero.
    fn check_set_up_for_start_of_line(&self) {
        assert!(
            !self.in_comment,
            "a line must not begin inside a comment"
        );
        assert_eq!(
            self.lbrace_count, 0,
            "a line must not begin inside an RC-block"
        );
    }
}
78
79/// This is the output of `LowerLexer`.
80#[derive(Debug, PartialEq, Eq)]
81pub(super) enum Lexeme<'a> {
82    EndOfInput,
83    Tok(super::Token),
84    Text(&'a str),
85    Err(Unrecognised),
86}
87
88/// `LowerLexer` uses a Logos-generated scanner to identify braces
89/// and comments, and keeps track of whether we are in an RC-block
90/// or a comment.  Other text is returned as-is.
91#[derive(Debug, Clone)]
92pub(super) struct LowerLexer<'a> {
93    inner: logos::Lexer<'a, InnerToken>,
94    state: State,
95}
96
97impl<'a> LowerLexer<'a> {
98    pub(super) fn new(input: &'a str) -> LowerLexer<'a> {
99        let result = LowerLexer {
100            inner: InnerToken::lexer(input),
101            state: Default::default(),
102        };
103        result.state.check_set_up_for_start_of_line();
104        result
105    }
106
107    pub(crate) fn span(&self) -> Range<usize> {
108        self.inner.span()
109    }
110
111    pub(super) fn next(&mut self) -> Lexeme<'a> {
112        use super::Token;
113
114        loop {
115            let tok = match self.inner.next() {
116                None => {
117                    return Lexeme::EndOfInput;
118                }
119                Some(Result::Err(())) => {
120                    if self.state.in_comment {
121                        // Skip.
122                        continue;
123                    }
124                    match self.inner.slice().chars().next() {
125                        Some(ch) => {
126                            return Lexeme::Err(Unrecognised::InvalidChar(ch));
127                        }
128                        None => {
129                            panic!("LowerLexer::next(): got error on zero-length content");
130                        }
131                    }
132                }
133                Some(Ok(tok)) => tok,
134            };
135            match tok {
136                InnerToken::Spaces | InnerToken::Annotation => {
137                    // Skip.
138                }
139                InnerToken::Tab => {
140                    // Parse tab differently to avoid joining symex
141                    // symbols across a space.  Per section 6-2.3 of
142                    // the User Handbook, tab is not allowed inside a
143                    // symex.
144                    return Lexeme::Tok(Token::Tab);
145                }
146                InnerToken::CommentStart => {
147                    self.state.in_comment = true;
148                }
149                InnerToken::LeftBrace => {
150                    if self.state.in_comment {
151                        // Left brace inside a comment is not special,
152                        // so we continue skipping the comment text.
153                        continue;
154                    }
155                    self.state.lbrace_count = self.state.lbrace_count.checked_add(1).expect(
156                        "the number of '{' on one line should be countable without overflow",
157                    );
158                    return Lexeme::Tok(Token::LeftBrace(Script::Normal));
159                }
160                InnerToken::RightBrace => {
161                    match self.state.lbrace_count.checked_sub(1) {
162                        None => {
163                            if self.state.in_comment {
164                                // Right brace inside a comment, but
165                                // there was no previous left brace.
166                                // Hence the } doesn't terminate an
167                                // RC-block.  So we continue skipping
168                                // the comment text.
169                                continue;
170                            }
171                            return Lexeme::Tok(Token::RightBrace(Script::Normal));
172                        }
173                        Some(reduced_count) => {
174                            self.state.lbrace_count = reduced_count;
175                            self.state.in_comment = false;
176                            return Lexeme::Tok(Token::RightBrace(Script::Normal));
177                        }
178                    }
179                }
180                InnerToken::Newline => {
181                    self.state.lbrace_count = 0;
182                    self.state.in_comment = false;
183                    self.state.check_set_up_for_start_of_line();
184                    return Lexeme::Tok(Token::Newline);
185                }
186                InnerToken::Text => {
187                    if self.state.in_comment {
188                        continue;
189                    }
190                    return Lexeme::Text(self.inner.slice());
191                }
192            }
193        }
194    }
195}
196
/// An annotation (text in square brackets) produces no lexeme at all.
#[test]
fn test_annotations_are_ignored() {
    let mut lexer = LowerLexer::new("->[THIS IS AN ANNOTATION]");
    let first = lexer.next();
    assert_eq!(first, Lexeme::Text("->"));
    let second = lexer.next();
    assert_eq!(second, Lexeme::EndOfInput);
}
204
/// After each lexeme is returned, `span()` covers exactly the text
/// of that lexeme.
#[test]
fn test_span() {
    let input = "XZ Y";
    let mut lex = LowerLexer::new(input);
    for expected in ["XZ", "Y"] {
        assert_eq!(lex.next(), Lexeme::Text(expected));
        assert_eq!(&input[lex.span()], expected);
    }
}