1use std::ops::Range;
2
3use diagnostics::make_unclosed_delims_error;
4use rustc_ast::ast::{self, AttrStyle};
5use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
6use rustc_ast::tokenstream::TokenStream;
7use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
8use rustc_errors::codes::*;
9use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
10use rustc_lexer::{
11 Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
12};
13use rustc_literal_escaper::{EscapeError, Mode, unescape_mixed, unescape_unicode};
14use rustc_session::lint::BuiltinLintDiag;
15use rustc_session::lint::builtin::{
16 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
17 TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
18};
19use rustc_session::parse::ParseSess;
20use rustc_span::{BytePos, Pos, Span, Symbol, sym};
21use tracing::debug;
22
23use crate::errors;
24use crate::lexer::diagnostics::TokenTreeDiagInfo;
25use crate::lexer::unicode_chars::UNICODE_ARRAY;
26
27mod diagnostics;
28mod tokentrees;
29mod unescape_error_reporting;
30mod unicode_chars;
31
32use unescape_error_reporting::{emit_unescape_error, escaped_char};
33
// Compile-time size check: on 64-bit targets `rustc_lexer::Token` is expected to occupy
// exactly 12 bytes, so an accidental change in its size is noticed at build time.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
40
/// Record describing a delimiter that could not be matched up.
///
/// NOTE(review): the field descriptions below are inferred from the field names only; the
/// code that fills them in lives outside this file section — confirm against `tokentrees`
/// and `diagnostics`.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// Presumably the delimiter that was actually found, if any.
    pub found_delim: Option<Delimiter>,
    /// Presumably the span of what was found.
    pub found_span: Span,
    /// Presumably the span of the delimiter that was left unclosed, if known.
    pub unclosed_span: Option<Span>,
    /// Presumably the span of a delimiter that is a likely match candidate, if any.
    pub candidate_span: Option<Span>,
}
48
49pub(crate) fn lex_token_trees<'psess, 'src>(
50 psess: &'psess ParseSess,
51 mut src: &'src str,
52 mut start_pos: BytePos,
53 override_span: Option<Span>,
54) -> Result<TokenStream, Vec<Diag<'psess>>> {
55 if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
57 src = &src[shebang_len..];
58 start_pos = start_pos + BytePos::from_usize(shebang_len);
59 }
60
61 let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
62 let mut lexer = Lexer {
63 psess,
64 start_pos,
65 pos: start_pos,
66 src,
67 cursor,
68 override_span,
69 nbsp_is_whitespace: false,
70 last_lifetime: None,
71 token: Token::dummy(),
72 diag_info: TokenTreeDiagInfo::default(),
73 };
74 let res = lexer.lex_token_trees(false);
75
76 let mut unmatched_delims: Vec<_> = lexer
77 .diag_info
78 .unmatched_delims
79 .into_iter()
80 .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
81 .collect();
82
83 match res {
84 Ok((_open_spacing, stream)) => {
85 if unmatched_delims.is_empty() {
86 Ok(stream)
87 } else {
88 Err(unmatched_delims)
90 }
91 }
92 Err(errs) => {
93 unmatched_delims.extend(errs);
96 Err(unmatched_delims)
97 }
98 }
99}
100
/// State of the lexer that turns raw `rustc_lexer` tokens into `rustc_ast` tokens.
struct Lexer<'psess, 'src> {
    /// Parse session; used for the diagnostic context, lint buffering and span bookkeeping.
    psess: &'psess ParseSess,
    /// Position corresponding to the first byte of `src`; `src_index` subtracts it to turn a
    /// `BytePos` into an index into `src`.
    start_pos: BytePos,
    /// Position just past the most recently consumed raw token.
    pos: BytePos,
    /// Source text being tokenized.
    src: &'src str,
    /// Cursor producing the raw `rustc_lexer` tokens.
    cursor: Cursor<'src>,
    /// When set, `mk_sp` returns this span instead of building one from positions.
    override_span: Option<Span>,
    /// Set once an "unknown start of token" error has been reported for U+00A0; later
    /// occurrences of that character are then skipped like whitespace.
    nbsp_is_whitespace: bool,

    /// Span of the leading `'` of the last lifetime seen. Used in diagnostics to suggest that
    /// double quotes may have been intended.
    last_lifetime: Option<Span>,

    /// Initialized to `Token::dummy()`; not otherwise touched in this file section.
    /// NOTE(review): presumably the current token used by the token-tree code — confirm.
    token: Token,

    /// Diagnostic bookkeeping for token trees, including the recorded unmatched delimiters.
    diag_info: TokenTreeDiagInfo,
}
126
127impl<'psess, 'src> Lexer<'psess, 'src> {
    /// Returns the diagnostic context handle of the parse session.
    fn dcx(&self) -> DiagCtxtHandle<'psess> {
        self.psess.dcx()
    }
131
132 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
133 self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
134 }
135
    /// Produces the next non-trivia token from the cursor.
    ///
    /// Whitespace, non-doc comments and frontmatter are skipped. The returned `bool` is `true`
    /// if anything was skipped in a way that counts as preceding whitespace (see the places
    /// that set `preceded_by_whitespace` below).
    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of upcoming invalid tokens to skip silently because they were already covered
        // by a "repeated" unknown-token error.
        let mut swallow_next_invalid = 0;
        // Loop until a token that is not skipped has been produced.
        loop {
            // Remaining input before this token; needed by the arms that re-lex part of it.
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            if let rustc_lexer::TokenKind::Semi
            | rustc_lexer::TokenKind::LineComment { .. }
            | rustc_lexer::TokenKind::BlockComment { .. }
            | rustc_lexer::TokenKind::CloseParen
            | rustc_lexer::TokenKind::CloseBrace
            | rustc_lexer::TokenKind::CloseBracket = token.kind
            {
                // After one of these tokens, forget the last lifetime so that it is no longer
                // used for the "did you mean double quotes" suggestions.
                self.last_lifetime = None;
            }

            // Convert the raw `rustc_lexer::TokenKind` into a `rustc_ast` `TokenKind`,
            // interning symbols and running additional validation along the way.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Non-doc comments are linted and then skipped.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // The first 3 bytes (the opening delimiter) are not part of the content.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Non-doc comments are linted and then skipped.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // The first 3 bytes (opening delimiter) and, when terminated, the last 2
                    // bytes (closing delimiter) are not part of the content.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
                    // Frontmatter is validated and then skipped like whitespace.
                    self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => self.ident(start),
                rustc_lexer::TokenKind::RawIdent => {
                    // Skip the 2-byte raw prefix when computing the identifier symbol.
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.psess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.psess.raw_identifier_spans.push(span);
                    token::Ident(sym, IdentIsRaw::Yes)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    self.report_unknown_prefix(start);
                    // The symbol is interned from the whole token text, i.e. starting at
                    // `start`; the 1-byte span at `start` is remembered as the last lifetime.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                // Recover an invalid identifier as an identifier, unless it is a single
                // character listed in `UNICODE_ARRAY` (those fall through to the
                // unknown-token arm further down).
                rustc_lexer::TokenKind::InvalidIdent
                    if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
                        let sym = self.str_from(start);
                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                    }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.psess
                        .bad_unicode_identifiers
                        .borrow_mut()
                        .entry(sym)
                        .or_default()
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
                // Before the 2021 edition, split a (raw) C string literal into its prefix,
                // returned now as an identifier, and the rest, which is lexed again next.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // Reset the state so that only the prefix counts as consumed.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    // `suffix_start` is relative to the token start; make it absolute.
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            // A lone `_` suffix is reported and then dropped.
                            self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
                                span: self.mk_sp(suffix_start, self.pos),
                            });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // The symbol is interned from the whole token text, i.e. starting at
                    // `start`; the 1-byte span at `start` is remembered as the last lifetime.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    if starts_with_number {
                        // The error is stashed rather than emitted right away.
                        let span = self.mk_sp(start, self.pos);
                        self.dcx()
                            .struct_err("lifetimes cannot start with a number")
                            .with_span(span)
                            .stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::RawLifetime => {
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));

                    // The first 3 bytes of the token are the prefix before the identifier.
                    let ident_start = start + BytePos(3);
                    let prefix_span = self.mk_sp(start, ident_start);

                    if prefix_span.at_least_rust_2021() {
                        // A raw lifetime immediately followed by `'` is reported as a
                        // character literal containing more than one character.
                        if self.cursor.as_str().starts_with('\'') {
                            let lit_span = self.mk_sp(start, self.pos + BytePos(1));
                            let contents = self.str_from_to(start + BytePos(1), self.pos);
                            emit_unescape_error(
                                self.dcx(),
                                contents,
                                lit_span,
                                lit_span,
                                Mode::Char,
                                0..contents.len(),
                                EscapeError::MoreThanOneChar,
                            )
                            .expect("expected error");
                        }

                        let span = self.mk_sp(start, self.pos);

                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }

                        // Put the `'` back in front of the lifetime name.
                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);

                        // Record the span as a raw identifier.
                        self.psess.raw_identifier_spans.push(span);

                        token::Lifetime(sym, IdentIsRaw::Yes)
                    } else {
                        // Before the 2021 edition only a lint is buffered, and just the first
                        // 2 bytes are treated as the lifetime token.
                        self.psess.buffer_lint(
                            RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                            prefix_span,
                            ast::CRATE_NODE_ID,
                            BuiltinLintDiag::RawPrefix(prefix_span),
                        );

                        // Reset the state so that only those 2 bytes count as consumed.
                        let lt_start = start + BytePos(2);
                        self.pos = lt_start;
                        self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);

                        let lifetime_name = self.str_from(start);
                        let ident = Symbol::intern(lifetime_name);
                        token::Lifetime(ident, IdentIsRaw::No)
                    }
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenParen,
                rustc_lexer::TokenKind::CloseParen => token::CloseParen,
                rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
                rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
                rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
                rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Bang,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::Minus,
                rustc_lexer::TokenKind::And => token::And,
                rustc_lexer::TokenKind::Or => token::Or,
                rustc_lexer::TokenKind::Plus => token::Plus,
                rustc_lexer::TokenKind::Star => token::Star,
                rustc_lexer::TokenKind::Slash => token::Slash,
                rustc_lexer::TokenKind::Caret => token::Caret,
                rustc_lexer::TokenKind::Percent => token::Percent,

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Repeats of the same invalid character were already covered by the error
                    // emitted for the first occurrence.
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // Once an error has been reported for a non-breaking space, later
                        // occurrences are skipped like whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    // Count how many times the same character immediately repeats.
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.dcx().emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    // Use the substitution token when one is available; otherwise skip the
                    // character like whitespace.
                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
462
463 fn ident(&self, start: BytePos) -> TokenKind {
464 let sym = nfc_normalize(self.str_from(start));
465 let span = self.mk_sp(start, self.pos);
466 self.psess.symbol_gallery.insert(sym, span);
467 token::Ident(sym, IdentIsRaw::No)
468 }
469
470 fn lint_unicode_text_flow(&self, start: BytePos) {
473 let content_start = start + BytePos(2);
475 let content = self.str_from(content_start);
476 if contains_text_flow_control_chars(content) {
477 let span = self.mk_sp(start, self.pos);
478 self.psess.buffer_lint(
479 TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
480 span,
481 ast::CRATE_NODE_ID,
482 BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
483 );
484 }
485 }
486
487 fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
488 if contains_text_flow_control_chars(content) {
489 self.report_text_direction_codepoint(
490 content,
491 self.mk_sp(start, self.pos),
492 0,
493 false,
494 "doc comment",
495 );
496 }
497 }
498
499 fn lint_literal_unicode_text_flow(
500 &mut self,
501 text: Symbol,
502 lit_kind: token::LitKind,
503 span: Span,
504 label: &'static str,
505 ) {
506 if !contains_text_flow_control_chars(text.as_str()) {
507 return;
508 }
509 let (padding, point_at_inner_spans) = match lit_kind {
510 token::LitKind::Str | token::LitKind::Char => (1, true),
512 token::LitKind::CStr => (2, true),
514 token::LitKind::StrRaw(n) => (n as u32 + 2, true),
516 token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
518 token::LitKind::Err(_) => return,
520 _ => (0, false),
522 };
523 self.report_text_direction_codepoint(
524 text.as_str(),
525 span,
526 padding,
527 point_at_inner_spans,
528 label,
529 );
530 }
531
532 fn report_text_direction_codepoint(
533 &self,
534 text: &str,
535 span: Span,
536 padding: u32,
537 point_at_inner_spans: bool,
538 label: &str,
539 ) {
540 let spans: Vec<_> = text
542 .char_indices()
543 .filter_map(|(i, c)| {
544 TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
545 let lo = span.lo() + BytePos(i as u32 + padding);
546 (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
547 })
548 })
549 .collect();
550
551 let count = spans.len();
552 let labels = point_at_inner_spans.then_some(spans.clone());
553
554 self.psess.buffer_lint(
555 TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
556 span,
557 ast::CRATE_NODE_ID,
558 BuiltinLintDiag::HiddenUnicodeCodepoints {
559 label: label.to_string(),
560 count,
561 span_label: span,
562 labels,
563 escape: point_at_inner_spans && !spans.is_empty(),
564 spans,
565 },
566 );
567 }
568
    /// Validates the frontmatter token that starts at `start` and ends at the current position.
    ///
    /// Gates the `frontmatter` feature on the frontmatter span and emits errors for: invalid
    /// whitespace before the opening (`has_invalid_preceding_whitespace`), an invalid
    /// infostring (`invalid_infostring`), a missing closing line, whitespace before the
    /// closing dashes, an opening/closing dash-count mismatch, and extra characters after the
    /// closing dashes.
    fn validate_frontmatter(
        &self,
        start: BytePos,
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    ) {
        let s = self.str_from(start);
        // The token is expected to contain `---`; the `unwrap` panics otherwise.
        let real_start = s.find("---").unwrap();
        let frontmatter_opening_pos = BytePos(real_start as u32) + start;
        let s_new = &s[real_start..];
        // Everything after the run of opening dashes.
        let within = s_new.trim_start_matches('-');
        let len_opening = s_new.len() - within.len();

        let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
        if has_invalid_preceding_whitespace {
            // Start of the line holding the opening dashes (token start if there is no `\n`).
            let line_start =
                BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
            let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
            let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
            self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        if invalid_infostring {
            // The infostring span runs from the end of the opening dashes to the end of
            // that line.
            let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
            let span = self.mk_sp(
                frontmatter_opening_end_pos,
                frontmatter_opening_pos + BytePos(line_end as u32),
            );
            self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
        }

        // The last line of the token is where the closing dashes are expected.
        let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
        let last_line = &within[last_line_start..];
        let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
        let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);

        let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
        self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);

        if !last_line_trimmed.starts_with("---") {
            // No closing line: report as unclosed and skip the remaining checks.
            let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            self.dcx().emit_err(errors::FrontmatterUnclosed {
                span: frontmatter_span,
                note_span: label_span,
            });
            return;
        }

        if last_line_trimmed.len() != last_line.len() {
            // Whitespace precedes the closing dashes.
            let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
            let span = self.mk_sp(last_line_start_pos, line_end);
            let whitespace_end =
                last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
            let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
            self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        // Whatever follows the run of closing dashes.
        let rest = last_line_trimmed.trim_start_matches('-');
        let len_close = last_line_trimmed.len() - rest.len();
        if len_close != len_opening {
            let span = self.mk_sp(frontmatter_opening_pos, self.pos);
            let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
            let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
            self.dcx().emit_err(errors::FrontmatterLengthMismatch {
                span,
                opening,
                close,
                len_opening,
                len_close,
            });
        }

        // Only whitespace may follow the closing dashes.
        if !rest.trim_matches(is_whitespace).is_empty() {
            let span = self.mk_sp(last_line_start_pos, self.pos);
            self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
        }
    }
653
654 fn cook_doc_comment(
655 &self,
656 content_start: BytePos,
657 content: &str,
658 comment_kind: CommentKind,
659 doc_style: DocStyle,
660 ) -> TokenKind {
661 if content.contains('\r') {
662 for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
663 let span = self.mk_sp(
664 content_start + BytePos(idx as u32),
665 content_start + BytePos(idx as u32 + 1),
666 );
667 let block = matches!(comment_kind, CommentKind::Block);
668 self.dcx().emit_err(errors::CrDocComment { span, block });
669 }
670 }
671
672 let attr_style = match doc_style {
673 DocStyle::Outer => AttrStyle::Outer,
674 DocStyle::Inner => AttrStyle::Inner,
675 };
676
677 token::DocComment(comment_kind, attr_style, Symbol::intern(content))
678 }
679
    /// Cooks a raw lexer literal spanning `start..end` into a literal kind and its symbol.
    ///
    /// Unterminated character, byte and string literals, and invalid raw strings, raise a
    /// fatal error. Invalid numeric literals emit a regular error and yield `token::Err`.
    ///
    /// The two trailing numeric arguments of the `cook_*` calls are the lengths of the
    /// literal's opening part (prefix, hashes, quote) and closing part (quote, hashes), which
    /// are excluded from the content.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    let mut err = self
                        .dcx()
                        .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
                        .with_code(E0762);
                    // If a lifetime was seen shortly before, suggest replacing its `'` and this
                    // literal's `'` with double quotes.
                    if let Some(lt_sp) = self.last_lifetime {
                        err.multipart_suggestion(
                            "if you meant to write a string literal, use double quotes",
                            vec![
                                (lt_sp, "\"".to_string()),
                                (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
                            ],
                            Applicability::MaybeIncorrect,
                        );
                    }
                    err.emit()
                }
                self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated byte constant",
                        )
                        .with_code(E0763)
                        .emit()
                }
                self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start, end),
                            "unterminated double quote string",
                        )
                        .with_code(E0765)
                        .emit()
                }
                self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated double quote byte string",
                        )
                        .with_code(E0766)
                        .emit()
                }
                self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated C string",
                        )
                        .with_code(E0767)
                        .emit()
                }
                self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                // `n_hashes` is `None` when the raw string is invalid; the error is then
                // reported by re-validating the text (prefix length 1).
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                let mut kind = token::Integer;
                if empty_int {
                    let span = self.mk_sp(start, end);
                    let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
                    kind = token::Err(guar);
                } else if matches!(base, Base::Binary | Base::Octal) {
                    // Check every digit after the 2-byte base prefix against the base; each
                    // invalid digit gets its own error.
                    let base = base as u32;
                    let s = self.str_from_to(start + BytePos(2), end);
                    for (idx, c) in s.char_indices() {
                        let span = self.mk_sp(
                            start + BytePos::from_usize(2 + idx),
                            start + BytePos::from_usize(2 + idx + c.len_utf8()),
                        );
                        if c != '_' && c.to_digit(base).is_none() {
                            let guar =
                                self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
                            kind = token::Err(guar);
                        }
                    }
                }
                (kind, self.symbol_from_to(start, end))
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                let mut kind = token::Float;
                if empty_exponent {
                    let span = self.mk_sp(start, self.pos);
                    let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
                    kind = token::Err(guar);
                }
                // Hexadecimal, octal and binary float literals are rejected.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    let guar =
                        self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                    kind = token::Err(guar)
                }
                (kind, self.symbol_from_to(start, end))
            }
        }
    }
828
    /// Converts an absolute position into a byte index into `self.src` by subtracting the
    /// lexer's start position.
    #[inline]
    fn src_index(&self, pos: BytePos) -> usize {
        (pos - self.start_pos).to_usize()
    }
833
    /// Returns the source text from `start` up to, but excluding, the current position.
    fn str_from(&self, start: BytePos) -> &'src str {
        self.str_from_to(start, self.pos)
    }
839
840 fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
842 debug!("taking an ident from {:?} to {:?}", start, end);
843 Symbol::intern(self.str_from_to(start, end))
844 }
845
846 fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
848 &self.src[self.src_index(start)..self.src_index(end)]
849 }
850
851 fn str_from_to_end(&self, start: BytePos) -> &'src str {
853 &self.src[self.src_index(start)..]
854 }
855
856 fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
857 match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
858 Err(RawStrError::InvalidStarter { bad_char }) => {
859 self.report_non_started_raw_string(start, bad_char)
860 }
861 Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
862 .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
863 Err(RawStrError::TooManyDelimiters { found }) => {
864 self.report_too_many_hashes(start, found)
865 }
866 Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
867 }
868 }
869
870 fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
871 self.dcx()
872 .struct_span_fatal(
873 self.mk_sp(start, self.pos),
874 format!(
875 "found invalid character; only `#` is allowed in raw string delimitation: {}",
876 escaped_char(bad_char)
877 ),
878 )
879 .emit()
880 }
881
882 fn report_unterminated_raw_string(
883 &self,
884 start: BytePos,
885 n_hashes: u32,
886 possible_offset: Option<u32>,
887 found_terminators: u32,
888 ) -> ! {
889 let mut err =
890 self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
891 err.code(E0748);
892 err.span_label(self.mk_sp(start, start), "unterminated raw string");
893
894 if n_hashes > 0 {
895 err.note(format!(
896 "this raw string should be terminated with `\"{}`",
897 "#".repeat(n_hashes as usize)
898 ));
899 }
900
901 if let Some(possible_offset) = possible_offset {
902 let lo = start + BytePos(possible_offset);
903 let hi = lo + BytePos(found_terminators);
904 let span = self.mk_sp(lo, hi);
905 err.span_suggestion(
906 span,
907 "consider terminating the string here",
908 "#".repeat(n_hashes as usize),
909 Applicability::MaybeIncorrect,
910 );
911 }
912
913 err.emit()
914 }
915
916 fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
917 let msg = match doc_style {
918 Some(_) => "unterminated block doc-comment",
919 None => "unterminated block comment",
920 };
921 let last_bpos = self.pos;
922 let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
923 err.code(E0758);
924 let mut nested_block_comment_open_idxs = vec![];
925 let mut last_nested_block_comment_idxs = None;
926 let mut content_chars = self.str_from(start).char_indices().peekable();
927
928 while let Some((idx, current_char)) = content_chars.next() {
929 match content_chars.peek() {
930 Some((_, '*')) if current_char == '/' => {
931 nested_block_comment_open_idxs.push(idx);
932 }
933 Some((_, '/')) if current_char == '*' => {
934 last_nested_block_comment_idxs =
935 nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
936 }
937 _ => {}
938 };
939 }
940
941 if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
942 err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
943 .span_label(
944 self.mk_sp(
945 start + BytePos(nested_open_idx as u32),
946 start + BytePos(nested_open_idx as u32 + 2),
947 ),
948 "...as last nested comment starts here, maybe you want to close this instead?",
949 )
950 .span_label(
951 self.mk_sp(
952 start + BytePos(nested_close_idx as u32),
953 start + BytePos(nested_close_idx as u32 + 2),
954 ),
955 "...and last nested comment terminates here.",
956 );
957 }
958
959 err.emit();
960 }
961
    /// Reports the unknown literal prefix spanning `start` to the current position.
    ///
    /// From the 2021 edition on this is an error, with a suggestion where one applies; in
    /// earlier editions only the `RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX` lint is buffered.
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);
        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition.at_least_rust_2021() {
            // `rb` and `rc` get dedicated suggestions (`UseBr` / `UseCr`).
            let sugg = if prefix == "rb" {
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if prefix == "rc" {
                Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
            } else if expn_data.is_root() {
                // If the prefix is directly followed by `'`, a lifetime was seen before, the
                // third upcoming character is not `'`, and everything is on one line, suggest
                // that a string literal was meant. Note that `start` is shadowed here by the
                // span of the last lifetime's `'`.
                if self.cursor.first() == '\''
                    && let Some(start) = self.last_lifetime
                    && self.cursor.third() != '\''
                    && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
                    && !self.psess.source_map().is_multiline(start.until(end))
                {
                    Some(errors::UnknownPrefixSugg::MeantStr { start, end })
                } else {
                    // Otherwise suggest inserting whitespace right after the prefix.
                    Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
                }
            } else {
                // No suggestion when the span's expansion data is not the root.
                None
            };
            self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before the 2021 edition, only buffer a lint.
            self.psess.buffer_lint(
                RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
            );
        }
    }
1005
    /// Handles a guarded string prefix token found at `start`.
    ///
    /// `str_before` is the remaining input as it was before the prefix token was consumed.
    /// From the 2024 edition on, an error is emitted and an error literal token is returned
    /// (an unterminated guarded string is fatal). In earlier editions a lint is buffered, the
    /// lexer is reset to just after the first `#`, and a `#` token is returned.
    fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
        let span = self.mk_sp(start, self.pos);
        let edition2024 = span.edition().at_least_rust_2024();

        // Zero-width span right after the first byte; used as the suggestion location.
        let space_pos = start + BytePos(1);
        let space_span = self.mk_sp(space_pos, space_pos);

        // Re-lex from the beginning of the token with a fresh cursor to see whether it is a
        // complete guarded string.
        let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);

        let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
            Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
                let end = start + BytePos(token_len);
                let span = self.mk_sp(start, end);
                let str_start = start + BytePos(n_hashes);

                // Only from the 2024 edition on is the whole guarded string consumed.
                if edition2024 {
                    self.cursor = cursor;
                    self.pos = end;
                }

                let unterminated = if terminated { None } else { Some(str_start) };

                (true, span, unterminated)
            }
            None => {
                // Not a string: the token is expected to start with `##`.
                debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

                (false, span, None)
            }
        };
        if edition2024 {
            if let Some(str_start) = unterminated {
                // An unterminated string is a fatal error.
                self.dcx()
                    .struct_span_fatal(
                        self.mk_sp(str_start, self.pos),
                        "unterminated double quote string",
                    )
                    .with_code(E0765)
                    .emit()
            }

            // No suggestion is offered for spans that come from an expansion.
            let sugg = if span.from_expansion() {
                None
            } else {
                Some(errors::GuardedStringSugg(space_span))
            };

            let err = if is_string {
                self.dcx().emit_err(errors::ReservedString { span, sugg })
            } else {
                self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
            };

            token::Literal(token::Lit {
                kind: token::Err(err),
                symbol: self.symbol_from_to(start, self.pos),
                suffix: None,
            })
        } else {
            // Before the 2024 edition, only buffer a lint.
            self.psess.buffer_lint(
                RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
                span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
            );

            // Consume only the first byte and produce a `#` token; the rest is lexed again.
            self.pos = start + BytePos(1);
            self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
            token::Pound
        }
    }
1089
1090 fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1091 self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1092 }
1093
1094 fn cook_common(
1095 &self,
1096 mut kind: token::LitKind,
1097 mode: Mode,
1098 start: BytePos,
1099 end: BytePos,
1100 prefix_len: u32,
1101 postfix_len: u32,
1102 unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
1103 ) -> (token::LitKind, Symbol) {
1104 let content_start = start + BytePos(prefix_len);
1105 let content_end = end - BytePos(postfix_len);
1106 let lit_content = self.str_from_to(content_start, content_end);
1107 unescape(lit_content, mode, &mut |range, result| {
1108 if let Err(err) = result {
1110 let span_with_quotes = self.mk_sp(start, end);
1111 let (start, end) = (range.start as u32, range.end as u32);
1112 let lo = content_start + BytePos(start);
1113 let hi = lo + BytePos(end - start);
1114 let span = self.mk_sp(lo, hi);
1115 let is_fatal = err.is_fatal();
1116 if let Some(guar) = emit_unescape_error(
1117 self.dcx(),
1118 lit_content,
1119 span_with_quotes,
1120 span,
1121 mode,
1122 range,
1123 err,
1124 ) {
1125 assert!(is_fatal);
1126 kind = token::Err(guar);
1127 }
1128 }
1129 });
1130
1131 let sym = if !matches!(kind, token::Err(_)) {
1134 Symbol::intern(lit_content)
1135 } else {
1136 self.symbol_from_to(start, end)
1137 };
1138 (kind, sym)
1139 }
1140
1141 fn cook_unicode(
1142 &self,
1143 kind: token::LitKind,
1144 mode: Mode,
1145 start: BytePos,
1146 end: BytePos,
1147 prefix_len: u32,
1148 postfix_len: u32,
1149 ) -> (token::LitKind, Symbol) {
1150 self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
1151 unescape_unicode(src, mode, &mut |span, result| callback(span, result.map(drop)))
1152 })
1153 }
1154
1155 fn cook_mixed(
1156 &self,
1157 kind: token::LitKind,
1158 mode: Mode,
1159 start: BytePos,
1160 end: BytePos,
1161 prefix_len: u32,
1162 postfix_len: u32,
1163 ) -> (token::LitKind, Symbol) {
1164 self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
1165 unescape_mixed(src, mode, &mut |span, result| callback(span, result.map(drop)))
1166 })
1167 }
1168}
1169
1170pub fn nfc_normalize(string: &str) -> Symbol {
1171 use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1172 match is_nfc_quick(string.chars()) {
1173 IsNormalized::Yes => Symbol::intern(string),
1174 _ => {
1175 let normalized_str: String = string.chars().nfc().collect();
1176 Symbol::intern(&normalized_str)
1177 }
1178 }
1179}