assembler/lexer/lower.rs
1//! A "partial" lexer which determines whether we're inside an
2//! RC-block or a comment.
3use std::ops::Range;
4
5use logos::Logos;
6
7use base::charset::Script;
8
9use super::super::glyph::Unrecognised;
10
/// `InnerToken` is the result of a "partial" lexer which only
/// identifies enough tokens to determine whether we're inside an
/// RC-block or a comment. We do this in order to handle
/// differing interpretations of '}' within a comment; if the
/// comment is inside an RC-block, then '}' terminates the
/// RC-block. But if the comment is not inside an RC-block, then
/// '}' is not special and forms part of the comment.
#[derive(Debug, Logos, PartialEq, Clone, Copy)]
pub(super) enum InnerToken {
    /// Start of a comment: two asterisks followed by every
    /// character up to (but not including) the next '}' or newline.
    /// The match stops at '}' so that the consumer can decide whether
    /// that brace closes an RC-block or is just more comment text.
    #[regex("[*][*][^}\n]*")]
    CommentStart,

    /// A single line-feed character.
    #[token("\n")]
    Newline,

    /// We use annotations in source files where we want a comment
    /// which isn't part of the assembler syntax. The interpretations
    /// of annotations may change in the future (but comments will
    /// not) so you should generally prefer to use a comment.
    ///
    /// An annotation is a '[', any run of characters other than ']',
    /// and a closing ']'.
    ///
    /// Should be higher-priority than Text.
    #[regex(r"\[[^]]*\]", priority = 5)]
    Annotation,

    /// A literal '{'.
    // There is no @..@ syntax for left-brace, but if there were,
    // we would need to match it here also.
    #[token("{")]
    LeftBrace,

    /// A literal '}'.
    #[token("}")]
    RightBrace,

    /// A non-empty run of characters other than space, '[', tab,
    /// '{', '}' and newline.
    // The Regex crate allows escaping in character classes, which is
    // something we need to do to use '[' in a (negated in this case)
    // character class.
    #[regex("[^ \\[\t{}\n]+", priority = 3)]
    Text,

    /// A single tab character.
    // We distinguish tab from spaces because they are handled
    // differently. Space is allowed inside symexes while tab is not.
    #[token("\t")]
    Tab,

    /// One or more consecutive space characters.
    #[regex("[ ]+")]
    Spaces,
}
57
/// Context the lower lexer carries from one token to the next:
/// whether a comment is in progress, and how many RC-blocks are
/// currently open.
///
/// RC-blocks can nest, so a plain flag is not enough; a counter is
/// needed to tell whether a given '}' actually closes an RC-block.
#[derive(Clone, Copy, Debug, Default)]
struct State {
    /// True while we are inside a comment.
    in_comment: bool,

    /// Number of '{' that have been opened and not yet closed.
    lbrace_count: usize,
}

impl State {
    /// Sanity check for the state expected at the beginning of a
    /// line.
    ///
    /// # Panics
    ///
    /// Panics if a comment is still in progress or if any RC-block
    /// is still open.
    fn check_set_up_for_start_of_line(&self) {
        // The comment flag is checked first, so it is the failure
        // reported when both conditions are violated.
        assert!(!self.in_comment);
        assert_eq!(self.lbrace_count, 0);
    }
}
78
/// This is the output of `LowerLexer`.
#[derive(Debug, PartialEq, Eq)]
pub(super) enum Lexeme<'a> {
    /// The underlying scanner has no more input to offer.
    EndOfInput,
    /// A token which `LowerLexer` identifies completely by itself
    /// (it produces tab, left-brace, right-brace and newline tokens
    /// this way).
    Tok(super::Token),
    /// A slice of the input, outside any comment, which
    /// `LowerLexer` passes through as-is without interpreting it.
    Text(&'a str),
    /// The underlying scanner could not match the input at this
    /// point (and we were not inside a comment).
    Err(Unrecognised),
}
87
/// `LowerLexer` uses a Logos-generated scanner to identify braces
/// and comments, and keeps track of whether we are in an RC-block
/// or a comment. Other text is returned as-is.
#[derive(Debug, Clone)]
pub(super) struct LowerLexer<'a> {
    /// The Logos-generated scanner which splits the input into
    /// `InnerToken`s.
    inner: logos::Lexer<'a, InnerToken>,
    /// Whether we are currently in a comment, and how many
    /// RC-blocks are open.
    state: State,
}
96
97impl<'a> LowerLexer<'a> {
98 pub(super) fn new(input: &'a str) -> LowerLexer<'a> {
99 let result = LowerLexer {
100 inner: InnerToken::lexer(input),
101 state: Default::default(),
102 };
103 result.state.check_set_up_for_start_of_line();
104 result
105 }
106
107 pub(crate) fn span(&self) -> Range<usize> {
108 self.inner.span()
109 }
110
111 pub(super) fn next(&mut self) -> Lexeme<'a> {
112 use super::Token;
113
114 loop {
115 let tok = match self.inner.next() {
116 None => {
117 return Lexeme::EndOfInput;
118 }
119 Some(Result::Err(())) => {
120 if self.state.in_comment {
121 // Skip.
122 continue;
123 }
124 match self.inner.slice().chars().next() {
125 Some(ch) => {
126 return Lexeme::Err(Unrecognised::InvalidChar(ch));
127 }
128 None => {
129 panic!("LowerLexer::next(): got error on zero-length content");
130 }
131 }
132 }
133 Some(Ok(tok)) => tok,
134 };
135 match tok {
136 InnerToken::Spaces | InnerToken::Annotation => {
137 // Skip.
138 }
139 InnerToken::Tab => {
140 // Parse tab differently to avoid joining symex
141 // symbols across a space. Per section 6-2.3 of
142 // the User Handbook, tab is not allowed inside a
143 // symex.
144 return Lexeme::Tok(Token::Tab);
145 }
146 InnerToken::CommentStart => {
147 self.state.in_comment = true;
148 }
149 InnerToken::LeftBrace => {
150 if self.state.in_comment {
151 // Left brace inside a comment is not special,
152 // so we continue skipping the comment text.
153 continue;
154 }
155 self.state.lbrace_count = self.state.lbrace_count.checked_add(1).expect(
156 "the number of '{' on one line should be countable without overflow",
157 );
158 return Lexeme::Tok(Token::LeftBrace(Script::Normal));
159 }
160 InnerToken::RightBrace => {
161 match self.state.lbrace_count.checked_sub(1) {
162 None => {
163 if self.state.in_comment {
164 // Right brace inside a comment, but
165 // there was no previous left brace.
166 // Hence the } doesn't terminate an
167 // RC-block. So we continue skipping
168 // the comment text.
169 continue;
170 }
171 return Lexeme::Tok(Token::RightBrace(Script::Normal));
172 }
173 Some(reduced_count) => {
174 self.state.lbrace_count = reduced_count;
175 self.state.in_comment = false;
176 return Lexeme::Tok(Token::RightBrace(Script::Normal));
177 }
178 }
179 }
180 InnerToken::Newline => {
181 self.state.lbrace_count = 0;
182 self.state.in_comment = false;
183 self.state.check_set_up_for_start_of_line();
184 return Lexeme::Tok(Token::Newline);
185 }
186 InnerToken::Text => {
187 if self.state.in_comment {
188 continue;
189 }
190 return Lexeme::Text(self.inner.slice());
191 }
192 }
193 }
194 }
195}
196
/// An annotation yields no lexeme of its own: only the text in
/// front of it is reported, followed by end-of-input.
#[test]
fn test_annotations_are_ignored() {
    let mut lexer = LowerLexer::new("->[THIS IS AN ANNOTATION]");
    let first = lexer.next();
    assert_eq!(first, Lexeme::Text("->"));
    let second = lexer.next();
    assert_eq!(second, Lexeme::EndOfInput);
}
204
/// After each text lexeme, `span()` must select exactly that
/// lexeme's text in the input, even when spaces were skipped before
/// it.
#[test]
fn test_span() {
    let source = "XZ Y";
    let mut lexer = LowerLexer::new(source);
    for expected in ["XZ", "Y"] {
        assert_eq!(lexer.next(), Lexeme::Text(expected));
        assert_eq!(&source[lexer.span()], expected);
    }
}