1use std::{
10 fmt::{Display, Write},
11 ops::Range,
12 str::CharIndices,
13};
14
15use base::{
16 Unsigned36Bit,
17 charset::{Script, subscript_char, superscript_char},
18 error::StringConversionFailed,
19};
20
21use super::{
22 glyph::{
23 Elevated, Glyph, GlyphShape, Unrecognised, elevate, glyph_from_name, glyph_of_char,
24 is_allowed_in_symex,
25 },
26 parser::helpers,
27 state::NumeralMode,
28};
29
30#[cfg(test)]
31mod input_file_tests;
32mod lower;
33#[cfg(test)]
34mod tests;
35
/// A half-open range of offsets into the input text, locating a token or
/// glyph within it.
type Span = Range<usize>;

/// The dot glyph as a single character; appended after the digits when a
/// numeric literal with a trailing dot is displayed.
pub(crate) const DOT_CHAR: char = '·';
/// The dot glyph as a string slice; the text displayed for a dot token.
pub(crate) const DOT_STR: &str = "·";
40
/// A run of digits exactly as written in the input, optionally followed
/// by a dot.
///
/// The digits are kept as text; they are only converted to a number on
/// request.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct NumericLiteral {
    /// The digit characters, in the order in which they were read.
    digits: String,

    /// True when a dot immediately followed the digits.
    has_trailing_dot: bool,
}
58
59impl Display for NumericLiteral {
60 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61 f.write_str(self.digits.as_str())?;
62 if self.has_trailing_dot {
63 f.write_char(DOT_CHAR)?;
64 }
65 Ok(())
66 }
67}
68
69impl NumericLiteral {
70 pub(crate) fn make_num(
71 &self,
72 mode: NumeralMode,
73 ) -> Result<Unsigned36Bit, StringConversionFailed> {
74 helpers::make_num(self.digits.as_str(), self.has_trailing_dot, mode)
75 }
76
77 pub(crate) fn append_digits_of_literal(&mut self, other: &NumericLiteral) {
78 assert!(!other.has_trailing_dot);
79 self.digits.push_str(&other.digits);
80 }
81
82 pub(crate) fn has_trailing_dot(&self) -> bool {
83 self.has_trailing_dot
84 }
85
86 pub(crate) fn take_digits(self) -> String {
87 self.digits
88 }
89}
90
91#[derive(Debug, PartialEq, Eq, Clone)]
94pub(crate) enum ErrorTokenKind {
95 Tab,
99 UnrecognisedGlyph(Unrecognised),
100}
101
102impl Display for ErrorTokenKind {
103 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104 match self {
105 ErrorTokenKind::Tab => {
106 const LONG_MSG: &str = concat!(
107 "Please do not use the TAB character. ",
108 "The differences between the M4 assembler's interpretation of tab and its interpreation of the space ",
109 "characer are complex, and these are not fully implemented. If you want to ",
110 "prevent two adjacent symexes being joined together, please use parentheses ",
111 "or an explicit '+' operation instead. That is, use (A)(B) or A+B instead of A<tab>B. ",
112 "If you intended to simply use TAB to produce some particular code layout, please ",
113 "use spaces instead.",
114 );
115 f.write_str(LONG_MSG)
116 }
117 ErrorTokenKind::UnrecognisedGlyph(e) => write!(f, "{e}"),
118 }
119 }
120}
121
122#[derive(Debug, PartialEq, Eq, Clone)]
124pub(crate) enum Token {
125 Error(ErrorTokenKind),
128 LeftBrace(Script),
129 RightBrace(Script),
130 Newline,
131 Tab,
132
133 LeftParen(Script),
136
137 RightParen(Script),
140
141 Hold,
149 NotHold, Arrow(Script),
151 Hand(Script),
152 Hash(Script),
153 Equals(Script),
154
155 Asterisk(Script),
163
164 Pipe(Script),
165 DoublePipe(Script),
166 ProperSuperset(Script),
167 SubsetOf(Script),
168 IdenticalTo(Script),
169 Tilde(Script),
170 LessThan(Script),
171 GreaterThan(Script),
172 Query(Script), Intersection(Script),
174 Union(Script),
175
176 Solidus(Script),
179
180 Plus(Script),
182 Minus(Script),
183 Times(Script),
184 LogicalOr(Script),
185 LogicalAnd(Script),
186
187 Digits(Script, NumericLiteral),
189
190 BitPosition(Script, String, String),
192
193 SymexSyllable(Script, String),
203
204 Dot(Script),
216 Comma(Script),
217}
218
219impl Display for Token {
220 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221 let mut write_elevated = |script: &Script, s: &str| -> std::fmt::Result {
222 let el = elevate(*script, s);
223 write!(f, "{el}")
224 };
225
226 match self {
227 Token::Error(msg) => write!(f, "(error: {msg})"),
228 Token::LeftBrace(script) => write_elevated(script, "{"),
229 Token::RightBrace(script) => write_elevated(script, "}"),
230 Token::Newline => f.write_char('\n'),
231 Token::Tab => f.write_char('\t'),
232 Token::LeftParen(script) => write_elevated(script, "("),
233 Token::RightParen(script) => write_elevated(script, ")"),
234 Token::Hold => f.write_char('h'),
235 Token::NotHold => f.write_char('ℏ'),
236 Token::Arrow(script) => write_elevated(script, "->"),
237 Token::Hand(script) => write_elevated(script, "☛"),
238 Token::Asterisk(script) => write_elevated(script, "*"),
239 Token::Dot(script) => write_elevated(script, DOT_STR),
240 Token::Hash(script) => write_elevated(script, "#"),
241 Token::Equals(script) => write_elevated(script, "="),
242 Token::Pipe(script) => write_elevated(script, "|"),
243 Token::DoublePipe(script) => write_elevated(script, "‖"), Token::ProperSuperset(script) => write_elevated(script, "⊃"), Token::SubsetOf(script) => write_elevated(script, "⊂"), Token::IdenticalTo(script) => write_elevated(script, "≡"),
247 Token::Tilde(script) => write_elevated(script, "~"),
248 Token::LessThan(script) => write_elevated(script, "<"),
249 Token::GreaterThan(script) => write_elevated(script, ">"),
250 Token::Query(script) => write_elevated(script, "?"),
251 Token::Intersection(script) => write_elevated(script, "∩"),
252 Token::Union(script) => write_elevated(script, "∪"),
253 Token::Solidus(script) => write_elevated(script, "/"),
254 Token::Plus(script) => write_elevated(script, "+"),
255 Token::Minus(script) => write_elevated(script, "-"),
256 Token::Times(script) => write_elevated(script, "×"),
257 Token::LogicalOr(script) => write_elevated(script, "∨"),
258 Token::LogicalAnd(script) => write_elevated(script, "∧"),
259 Token::Digits(script, numeric_literal) => {
260 write!(f, "{}", elevate(*script, numeric_literal.to_string()))
261 }
262 Token::BitPosition(script, quarter, bit) => {
263 let q_string = elevate(*script, quarter.to_string());
264 let bit_string = elevate(*script, bit.to_string());
265 let dotname = match script {
266 Script::Normal => "@dot@",
267 Script::Sub => "@sub_dot@",
268 Script::Super => "@sup_dot@",
269 };
270 write!(f, "{q_string}{dotname}{bit_string}")
271 }
272 Token::SymexSyllable(script, name) => {
273 #[allow(clippy::unnecessary_wraps)]
274 fn nochange(ch: char) -> Result<char, ()> {
275 Ok(ch)
276 }
277 fn convert_to_sup(ch: char) -> Result<char, ()> {
278 superscript_char(ch).map_err(|_| ())
279 }
280 fn convert_to_sub(ch: char) -> Result<char, ()> {
281 subscript_char(ch).map_err(|_| ())
282 }
283 type Transformer = fn(char) -> Result<char, ()>;
284 let (prefix, transform): (&'static str, Transformer) = match script {
285 Script::Super => ("super_", convert_to_sup),
286 Script::Normal => ("", nochange),
287 Script::Sub => ("sub_", convert_to_sub),
288 };
289 for ch in name.chars() {
290 match transform(ch) {
291 Ok(sup_ch) => f.write_char(sup_ch),
292 Err(()) => match ch {
293 'α' => write!(f, "@{prefix}alpha@"),
294 'β' => write!(f, "@{prefix}beta@"),
295 'γ' => write!(f, "@{prefix}gamma@"),
296 'Δ' => write!(f, "@{prefix}delta@"),
297 'ε' => write!(f, "@{prefix}eps@"),
298 'λ' => write!(f, "@{prefix}lambda@"),
299 _ => write!(f, "@{prefix}{ch}@"),
300 },
301 }?;
302 }
303 Ok(())
304 }
305 Token::Comma(script) => write_elevated(script, ","),
306 }
307 }
308}
309
/// Turns input text into a sequence of glyphs, one per character or per
/// `@name@` escape.
#[derive(Clone, Debug)]
struct GlyphRecognizer<'a> {
    /// The characters of the input, with their offsets.
    it: CharIndices<'a>,
    /// Offset of the character most recently read from `it`.
    pos: usize,
    /// Offset at which the most recently started glyph begins.
    glyph_start: usize,
}
317
318impl<'a> GlyphRecognizer<'a> {
319 fn new(input: &'a str) -> GlyphRecognizer<'a> {
320 Self {
321 it: input.char_indices(),
322 pos: 0,
323 glyph_start: 0,
324 }
325 }
326
327 fn get_next_char(&mut self) -> Option<char> {
328 match self.it.next() {
329 None => None,
330 Some((i, ch)) => {
331 self.pos = i;
332 Some(ch)
333 }
334 }
335 }
336
337 fn span(&self) -> Span {
338 self.glyph_start..(self.it.offset())
339 }
340
341 fn next_named_glyph(&mut self) -> Option<Result<Elevated<&'static Glyph>, Unrecognised>> {
342 let mut name: String = String::with_capacity(10);
343 let mut got_anything = false;
344 while let Some(ch) = self.get_next_char() {
345 got_anything = true;
346 if ch == '@' {
347 break;
348 }
349 name.push(ch);
350 }
351
352 if got_anything {
356 Some(match glyph_from_name(name.as_str()) {
357 Some(g) => Ok(g),
358 None => Err(Unrecognised::UnrecognisedGlyph(name)),
359 })
360 } else {
361 None
362 }
363 }
364}
365
366impl Iterator for GlyphRecognizer<'_> {
367 type Item = Result<Elevated<&'static Glyph>, Unrecognised>;
368
369 fn next(&mut self) -> Option<Self::Item> {
370 let ch = self.get_next_char()?;
371 self.glyph_start = self.pos;
372 match ch {
373 '@' => match self.next_named_glyph() {
374 None => {
375 Some(Err(Unrecognised::InvalidChar('@')))
379 }
380 something => something,
381 },
382 ch => Some(glyph_of_char(ch)),
383 }
384 }
385}
386
/// A single ordinary character is recognised as one normal-script glyph,
/// after which the recognizer is exhausted.
#[test]
fn test_glyph_recognizer_next() {
    let mut gr = GlyphRecognizer::new("W");
    let first = gr.next();
    if let Some(Ok(elev)) = &first {
        assert_eq!(elev.script(), Script::Normal);
        assert_eq!(elev.get().name, "W");
    } else {
        panic!("glyph should not have been recognised as {first:?}");
    }
    assert_eq!(gr.next(), None);
}
401
/// Test helper: checks that `got` has the expected script and shape.
#[cfg(test)]
fn assert_glyph(
    got: Elevated<&'static Glyph>,
    expected_shape: GlyphShape,
    expected_script: Script,
) {
    let actual_script = got.script();
    assert_eq!(actual_script, expected_script, "wrong script for {got:?}");
    let actual_shape = got.get().shape();
    assert_eq!(actual_shape, expected_shape, "wrong shape for {got:?}");
}
411
/// Scans a mixture of plain characters, named glyphs in all three scripts,
/// an unknown character and an unknown glyph name.
#[test]
fn test_glyph_scanning() {
    /// Takes the next glyph, which must exist and be valid, and checks it.
    fn expect_glyph(scanner: &mut GlyphRecognizer<'_>, shape: GlyphShape, script: Script) {
        let glyph = scanner.next().expect("input").expect("in character set");
        assert_glyph(glyph, shape, script);
    }

    let mut scanner = GlyphRecognizer::new("hs@sub_eps@@hamb@@sup_add@@nosuch@ ");
    expect_glyph(&mut scanner, GlyphShape::h, Script::Normal);

    let unknown_char = scanner.next();
    assert_eq!(unknown_char, Some(Err(Unrecognised::InvalidChar('s'))),);

    expect_glyph(&mut scanner, GlyphShape::Epsilon, Script::Sub);
    expect_glyph(&mut scanner, GlyphShape::IdenticalTo, Script::Normal);
    expect_glyph(&mut scanner, GlyphShape::Add, Script::Super);

    let unknown_name = scanner.next();
    assert_eq!(
        unknown_name,
        Some(Err(Unrecognised::UnrecognisedGlyph("nosuch".to_string())))
    );

    expect_glyph(&mut scanner, GlyphShape::Space, Script::Normal);

    // Once exhausted, the scanner stays exhausted.
    assert_eq!(scanner.next(), None);
    assert_eq!(scanner.next(), None);
}
451
452fn tokenise_single_glyph(g: Elevated<&'static Glyph>) -> Option<Token> {
453 let script: Script = g.script();
454
455 let make_num = |ch: char| {
456 let literal = NumericLiteral {
457 digits: {
458 let mut s = String::with_capacity(12);
459 s.push(ch);
460 s
461 },
462 has_trailing_dot: false,
463 };
464 Token::Digits(script, literal)
465 };
466 let make_symex = || -> Option<Token> {
467 let name: String = g.get().get_char(Script::Normal).iter().collect();
472 match name.chars().count() {
476 0 => {
477 panic!(
478 "incoming token '{g:?}' was assigned as part of a symex syllable, but we don't have a character for it in script {script:?}"
479 );
480 }
481 1 => (),
482 n => {
483 panic!(
484 "incoming token '{g:?}' was assigned as part of a symex syllable, but the resuting initial token body unexpectedly has more than one character (specifically, {n}): {name:?}"
485 );
486 }
487 }
488 Some(Token::SymexSyllable(script, name))
489 };
490
491 #[allow(clippy::match_same_arms)] let output: Option<Token> = match g.get().shape() {
496 GlyphShape::Space | GlyphShape::Tab => None,
497 GlyphShape::Digit0 => Some(make_num('0')),
498 GlyphShape::Digit1 => Some(make_num('1')),
499 GlyphShape::Digit2 => Some(make_num('2')),
500 GlyphShape::Digit3 => Some(make_num('3')),
501 GlyphShape::Digit4 => Some(make_num('4')),
502 GlyphShape::Digit5 => Some(make_num('5')),
503 GlyphShape::Digit6 => Some(make_num('6')),
504 GlyphShape::Digit7 => Some(make_num('7')),
505 GlyphShape::Digit8 => Some(make_num('8')),
506 GlyphShape::Digit9 => Some(make_num('9')),
507 GlyphShape::Underscore
508 | GlyphShape::Circle
509 | GlyphShape::A
510 | GlyphShape::B
511 | GlyphShape::C
512 | GlyphShape::D
513 | GlyphShape::E
514 | GlyphShape::F
515 | GlyphShape::G
516 | GlyphShape::H
517 | GlyphShape::I
518 | GlyphShape::J
519 | GlyphShape::K
520 | GlyphShape::L
521 | GlyphShape::M
522 | GlyphShape::N
523 | GlyphShape::O
524 | GlyphShape::P
525 | GlyphShape::Q
526 | GlyphShape::R
527 | GlyphShape::S
528 | GlyphShape::T
529 | GlyphShape::U
530 | GlyphShape::V
531 | GlyphShape::W
532 | GlyphShape::X
533 | GlyphShape::Y
534 | GlyphShape::Z => make_symex(),
535 GlyphShape::LeftParen => Some(Token::LeftParen(script)),
536 GlyphShape::RightParen => Some(Token::RightParen(script)),
537 GlyphShape::Add => Some(Token::Plus(script)),
538 GlyphShape::Minus => Some(Token::Minus(script)),
539 GlyphShape::Comma => Some(Token::Comma(script)),
540 GlyphShape::Dot => Some(Token::Dot(script)),
541 GlyphShape::Backspace => unimplemented!("compound characters are not yet supported"),
542 GlyphShape::Hand => Some(Token::Hand(script)),
543 GlyphShape::Sigma => {
544 todo!("Sigma (which is a symex terminator) does not yet have a token")
545 }
546 GlyphShape::Pipe => Some(Token::Pipe(script)),
547 GlyphShape::DoublePipe => Some(Token::DoublePipe(script)),
548 GlyphShape::Solidus => Some(Token::Solidus(script)),
549 GlyphShape::Times => Some(Token::Times(script)),
550 GlyphShape::Hash => Some(Token::Hash(script)),
551 GlyphShape::Arrow => Some(Token::Arrow(script)),
552 GlyphShape::LessThan => Some(Token::LessThan(script)),
553 GlyphShape::GreaterThan => Some(Token::GreaterThan(script)),
554 GlyphShape::Overbar | GlyphShape::Square | GlyphShape::n => make_symex(),
555 GlyphShape::SubsetOf => Some(Token::SubsetOf(script)),
556 GlyphShape::Or => Some(Token::LogicalOr(script)),
557 GlyphShape::q
558 | GlyphShape::Gamma
559 | GlyphShape::t
560 | GlyphShape::w
561 | GlyphShape::x
562 | GlyphShape::i
563 | GlyphShape::y
564 | GlyphShape::z => make_symex(),
565 GlyphShape::Query => Some(Token::Query(script)),
566 GlyphShape::Union => Some(Token::Union(script)),
567 GlyphShape::Intersection => Some(Token::Intersection(script)),
568 GlyphShape::j | GlyphShape::k => make_symex(),
569 GlyphShape::Alpha => make_symex(),
570 GlyphShape::Delta => make_symex(),
571 GlyphShape::p => make_symex(),
572 GlyphShape::Epsilon => make_symex(),
573 GlyphShape::h => Some(match script {
574 Script::Super | Script::Sub => unimplemented!(),
577 Script::Normal => Token::Hold,
578 }),
579 GlyphShape::SupersetOf => Some(Token::ProperSuperset(script)),
581 GlyphShape::Beta => make_symex(),
582 GlyphShape::And => Some(Token::LogicalAnd(script)),
583 GlyphShape::Lambda => make_symex(),
584 GlyphShape::Tilde => Some(Token::Tilde(script)),
585 GlyphShape::LeftBrace => Some(Token::LeftBrace(script)),
586 GlyphShape::RightBrace => Some(Token::RightBrace(script)),
587 GlyphShape::IdenticalTo => Some(Token::IdenticalTo(script)),
588 GlyphShape::Equals => Some(Token::Equals(script)),
589 GlyphShape::Apostrophe => make_symex(),
590 GlyphShape::Asterisk => Some(Token::Asterisk(script)),
591 };
592 if let Some(t) = output.as_ref() {
593 if matches!(t, Token::SymexSyllable(_, _)) {
594 assert!(
595 is_allowed_in_symex(g.get().shape),
596 "attempted to make a symex with disallowed glyph shape {g:?}"
597 );
598 } else if matches!(t, Token::Digits(_, _) | Token::Dot(_)) {
599 assert!(
605 is_allowed_in_symex(g.get().shape),
606 "all glyphs allowed in numeric literals are also allowed in symexes, but this went wrong for {g:?}"
607 );
608 } else if g.get().shape == GlyphShape::Space {
609 } else {
611 assert!(
612 !is_allowed_in_symex(g.get().shape),
613 "glyph shape {g:?} is allowed in a symex but the scanner didn't recognise it that way"
614 );
615 }
616 }
617 output
618}
619
620#[derive(Debug, PartialEq, Eq)]
621enum TokenMergeResult {
622 Merged(Token, Span),
623 Failed {
624 current: Token,
625 current_span: Span,
626 incoming: Token,
627 incoming_span: Span,
628 },
629}
630
631fn merge_tokens(current: (Token, Span), incoming: (Token, Span)) -> TokenMergeResult {
632 let ((current, current_span), (incoming, incoming_span)) = (current, incoming);
635 if matches!(
636 (¤t, &incoming),
637 (&Token::Error(_), _) | (_, &Token::Error(_))
638 ) {
639 return TokenMergeResult::Failed {
640 current,
641 current_span,
642 incoming,
643 incoming_span,
644 };
645 }
646
647 let merged_span = current_span.start..incoming_span.end;
648 match current {
649 Token::Minus(incoming_script)
650 if incoming == Token::GreaterThan(incoming_script)
651 && incoming_script == Script::Normal =>
652 {
653 TokenMergeResult::Merged(Token::Arrow(Script::Normal), merged_span)
654 }
655 Token::SymexSyllable(existing_script, mut existing_name) => match incoming {
656 Token::Hold if existing_script == Script::Normal => {
657 if existing_name == "\u{0305}" {
659 TokenMergeResult::Merged(Token::NotHold, merged_span)
660 } else {
661 TokenMergeResult::Failed {
662 current: Token::SymexSyllable(existing_script, existing_name),
663 current_span,
664 incoming: Token::Hold,
665 incoming_span,
666 }
667 }
668 }
669 Token::SymexSyllable(incoming_script, incoming_name)
670 if existing_script == incoming_script =>
671 {
672 existing_name.push_str(&incoming_name);
673 TokenMergeResult::Merged(
674 Token::SymexSyllable(existing_script, existing_name),
675 merged_span,
676 )
677 }
678 Token::Digits(incoming_script, literal) if existing_script == incoming_script => {
679 existing_name.push_str(&literal.digits);
680 if literal.has_trailing_dot {
681 existing_name.push(DOT_CHAR);
682 }
683 TokenMergeResult::Merged(
684 Token::SymexSyllable(existing_script, existing_name),
685 merged_span,
686 )
687 }
688 other => TokenMergeResult::Failed {
689 current: Token::SymexSyllable(existing_script, existing_name),
690 current_span,
691 incoming: other,
692 incoming_span,
693 },
694 },
695 Token::Digits(existing_script, mut existing_literal) => {
696 if existing_literal.has_trailing_dot {
697 match incoming {
701 Token::Digits(
702 Script::Sub,
703 NumericLiteral {
704 digits: incoming_digit,
705 has_trailing_dot: false,
706 },
707 ) if existing_script == Script::Sub => TokenMergeResult::Merged(
708 Token::BitPosition(
709 existing_script,
710 existing_literal.digits,
711 incoming_digit,
712 ),
713 merged_span,
714 ),
715 other => TokenMergeResult::Failed {
717 current: Token::Digits(existing_script, existing_literal),
718 current_span,
719 incoming: other,
720 incoming_span,
721 },
722 }
723 } else {
724 match incoming {
725 Token::Digits(incoming_script, incoming_name)
726 if existing_script == incoming_script =>
727 {
728 existing_literal.append_digits_of_literal(&incoming_name);
729 TokenMergeResult::Merged(
730 Token::Digits(existing_script, existing_literal),
731 merged_span,
732 )
733 }
734 Token::Dot(right_script)
735 if existing_script == right_script
736 && !existing_literal.has_trailing_dot =>
737 {
738 existing_literal.has_trailing_dot = true;
739 TokenMergeResult::Merged(
740 Token::Digits(existing_script, existing_literal),
741 merged_span,
742 )
743 }
744 Token::SymexSyllable(incoming_script, sym)
745 if existing_script == incoming_script =>
746 {
747 let mut existing_name: String = existing_literal.digits;
748 existing_name.push_str(&sym);
749 TokenMergeResult::Merged(
750 Token::SymexSyllable(existing_script, existing_name),
751 merged_span,
752 )
753 }
754 other => TokenMergeResult::Failed {
755 current: Token::Digits(existing_script, existing_literal),
756 current_span,
757 incoming: other,
758 incoming_span,
759 },
760 }
761 }
762 }
763 Token::BitPosition(existing_script, existing_quarter, mut existing_bit) => match incoming {
764 Token::Digits(
765 incoming_script,
766 NumericLiteral {
767 digits: incoming_digit,
768 has_trailing_dot: false,
769 },
770 ) if existing_script == incoming_script => {
771 existing_bit.push_str(incoming_digit.as_str());
772 TokenMergeResult::Merged(
773 Token::BitPosition(existing_script, existing_quarter, existing_bit),
774 merged_span,
775 )
776 }
777 Token::Dot(incoming_script) if existing_script == incoming_script => {
778 let name = format!("{existing_quarter}\u{00B7}{existing_bit}\u{00B7}");
779 TokenMergeResult::Merged(Token::SymexSyllable(Script::Sub, name), merged_span)
780 }
781 Token::SymexSyllable(incoming_script, symbol) if existing_script == incoming_script => {
782 let name = format!("{existing_quarter}\u{00B7}{existing_bit}{symbol}");
783 TokenMergeResult::Merged(Token::SymexSyllable(Script::Sub, name), merged_span)
784 }
785 other => TokenMergeResult::Failed {
786 current: Token::BitPosition(existing_script, existing_quarter, existing_bit),
787 current_span,
788 incoming: other,
789 incoming_span,
790 },
791 },
792 existing => TokenMergeResult::Failed {
793 current: existing,
794 current_span,
795 incoming,
796 incoming_span,
797 },
798 }
799}
800
801#[derive(Debug, Clone)]
804struct GlyphTokenizer<'a> {
805 current: Option<(Token, Span)>,
806 inner: GlyphRecognizer<'a>,
807}
808
809impl<'a> GlyphTokenizer<'a> {
810 fn new(input: &'a str) -> GlyphTokenizer<'a> {
811 GlyphTokenizer {
812 current: None,
813 inner: GlyphRecognizer::new(input),
814 }
815 }
816
817 fn get_next_spanned_token(&mut self) -> Option<(Token, Span)> {
818 loop {
819 let maybe_spanned_new_token: Option<(Token, Span)> = match self.inner.next() {
820 None => None,
821 Some(Err(Unrecognised::InvalidChar('ℏ'))) => {
822 return Some((Token::NotHold, self.inner.span()));
833 }
834 Some(Err(e)) => {
835 let error_token = Token::Error(ErrorTokenKind::UnrecognisedGlyph(e));
836 Some((error_token, self.inner.span()))
837 }
838 Some(Ok(g)) => {
839 if matches!(g.get().shape(), GlyphShape::Space | GlyphShape::Tab) {
840 match self.current.take() {
841 Some(r) => {
842 return Some(r);
843 }
844 None => {
845 continue;
846 }
847 }
848 }
849 let tok_start = self.inner.span().start;
850 match tokenise_single_glyph(g) {
851 Some(token) => {
852 let span = tok_start..self.inner.span().end;
853 Some((token, span))
854 }
855 None => {
856 unimplemented!("unable) to convert glyph '{g:?}' to a token")
857 }
858 }
859 }
860 };
861 if let Some((newtoken, newtoken_span)) = maybe_spanned_new_token {
862 if let Some((current, current_span)) = self.current.take() {
863 match merge_tokens((current, current_span), (newtoken, newtoken_span)) {
864 TokenMergeResult::Merged(merged, merged_span) => {
865 self.current = Some((merged, merged_span));
866 }
867 TokenMergeResult::Failed {
868 current,
869 current_span,
870 incoming: newtoken,
871 incoming_span: newtoken_span,
872 } => {
873 let result = (current, current_span);
874 self.current = Some((newtoken, newtoken_span));
875 return Some(result);
876 }
877 }
878 } else {
879 self.current = Some((newtoken, newtoken_span));
881 }
882 } else {
883 return self.current.take();
885 }
886 }
887 }
888}
889
/// `h` and `x` do not merge: they come out as two separate tokens.
#[test]
fn test_glyph_tokenizer_simple_multi_token() {
    let mut lex = GlyphTokenizer::new("hx");
    let tokens: Vec<(Token, Span)> =
        std::iter::from_fn(|| lex.get_next_spanned_token()).collect();
    assert_eq!(
        tokens,
        vec![
            (Token::Hold, 0..1),
            (Token::SymexSyllable(Script::Normal, "x".to_string()), 1..2),
        ]
    );
}
900
/// A glyph given by name becomes a token whose span covers the whole
/// `@...@` escape.
#[test]
fn test_glyph_tokenizer_glyph_names() {
    let mut lex = GlyphTokenizer::new("@sup_eps@");
    let expected = (Token::SymexSyllable(Script::Super, "ε".to_string()), 0..9);
    assert_eq!(lex.get_next_spanned_token(), Some(expected));
    assert_eq!(lex.get_next_spanned_token(), None);
}
910
/// Two superscript glyphs, one named and one written directly, merge into
/// a single syllable.
#[test]
fn test_glyph_tokenizer_multi_glyph_token() {
    let input = "@sup_eps@ᵂ";
    let mut lex = GlyphTokenizer::new(input);
    let expected = (Token::SymexSyllable(Script::Super, "εW".to_string()), 0..12);
    assert_eq!(lex.get_next_spanned_token(), Some(expected));
    assert_eq!(lex.get_next_spanned_token(), None);
}
923
/// Adjacent glyphs in different scripts are not merged.
#[test]
fn test_glyph_tokenizer_script_change_breaks_tokens() {
    let mut lex = GlyphTokenizer::new("@sup_eps@W");
    let tokens: Vec<(Token, Span)> =
        std::iter::from_fn(|| lex.get_next_spanned_token()).collect();
    assert_eq!(
        tokens,
        vec![
            (Token::SymexSyllable(Script::Super, "ε".to_string()), 0..9),
            (Token::SymexSyllable(Script::Normal, "W".to_string()), 9..10),
        ]
    );
}
940
/// A space separates two syllables which would otherwise have merged, and
/// is not part of either span.
#[test]
fn test_glyph_tokenizer_space_breaks_tokens() {
    let mut lex = GlyphTokenizer::new("W Q");
    let tokens: Vec<(Token, Span)> =
        std::iter::from_fn(|| lex.get_next_spanned_token()).collect();
    assert_eq!(
        tokens,
        vec![
            (Token::SymexSyllable(Script::Normal, "W".to_string()), 0..1),
            (Token::SymexSyllable(Script::Normal, "Q".to_string()), 2..3),
        ]
    );
}
957
/// A question mark is tokenised as `Query`.
#[test]
fn test_glyph_tokenizer_question_mark() {
    let mut lex = GlyphTokenizer::new("?");
    let first = lex.get_next_spanned_token();
    assert_eq!(first, Some((Token::Query(Script::Normal), 0..1)));
}
966
967#[derive(Debug, Clone)]
976pub(crate) struct Lexer<'a> {
977 lower: lower::LowerLexer<'a>,
978 upper: Option<GlyphTokenizer<'a>>,
979 upper_span: Option<Span>,
980}
981
982impl<'a> Lexer<'a> {
983 pub(crate) fn new(input: &'a str) -> Lexer<'a> {
984 Lexer {
985 lower: lower::LowerLexer::new(input),
986 upper: None,
987 upper_span: None,
988 }
989 }
990
991 fn adjust_span(&self, span: Range<usize>) -> Range<usize> {
992 match self.upper_span.as_ref() {
993 None => span,
994 Some(upper_span) => {
995 let offset = span.start;
996 (upper_span.start + offset)..(upper_span.end + offset)
997 }
998 }
999 }
1000
1001 pub(crate) fn span(&self) -> Range<usize> {
1002 self.adjust_span(self.lower.span())
1003 }
1004
1005 pub(crate) fn spanned(&self) -> SpannedIter<'a> {
1006 let lexer: Lexer<'a> = self.clone();
1007 SpannedIter::new(lexer)
1008 }
1009
1010 fn get_next(&mut self) -> Option<Token> {
1011 use lower::Lexeme;
1012 if let Some(upper_lexer) = self.upper.as_mut()
1014 && let Some((r, span)) = upper_lexer.get_next_spanned_token()
1015 {
1016 self.upper_span = Some(span);
1017 return Some(r);
1018 }
1019
1020 self.upper = None;
1022 self.upper_span = None;
1023 match self.lower.next() {
1024 Lexeme::EndOfInput => None,
1025 Lexeme::Tok(tok) => Some(tok),
1026 Lexeme::Text(text) => {
1027 let upper = GlyphTokenizer::new(text);
1028 self.upper = Some(upper);
1029 match self
1030 .upper
1031 .as_mut()
1032 .expect("the option cannot be empty, we just filled it")
1033 .get_next_spanned_token()
1034 {
1035 Some((r, span)) => {
1036 self.upper_span = Some(span);
1037 Some(r)
1038 }
1039 None => None,
1040 }
1041 }
1042 Lexeme::Err(e) => Some(Token::Error(ErrorTokenKind::UnrecognisedGlyph(e))),
1043 }
1044 }
1045}
1046
1047impl Iterator for Lexer<'_> {
1048 type Item = Token;
1049
1050 fn next(&mut self) -> Option<Token> {
1051 self.get_next()
1052 }
1053}
1054
1055#[derive(Debug, Clone)]
1056pub(crate) struct SpannedIter<'a> {
1057 lexer: Lexer<'a>,
1058}
1059
1060impl<'a> SpannedIter<'a> {
1061 pub(crate) fn new(lexer: Lexer<'a>) -> SpannedIter<'a> {
1062 SpannedIter { lexer }
1063 }
1064}
1065
1066impl Iterator for SpannedIter<'_> {
1067 type Item = (Token, Span);
1068
1069 fn next(&mut self) -> Option<Self::Item> {
1070 let token = self.lexer.next();
1071 token.map(|t| (t, self.lexer.span()))
1072 }
1073}