1use diagnostics::make_errors_for_mismatched_closing_delims;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9 Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15 TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
// Size guard: on 64-bit targets `rustc_lexer::Token` is statically asserted to be
// exactly 12 bytes, so an accidental change to its layout is caught at build time.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
/// Record describing a delimiter that could not be paired up during lexing.
///
/// NOTE(review): the per-field descriptions below are inferred from the field
/// names and types only; the code that fills this struct in is not part of this
/// file section — confirm against the token-tree lexer.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// Presumably the delimiter that was actually found, if there was one.
    pub found_delim: Option<Delimiter>,
    /// Presumably the location of that found delimiter.
    pub found_span: Span,
    /// Presumably the location of the opening delimiter left unclosed, if known.
    pub unclosed_span: Option<Span>,
    /// Presumably a location suggested as the likely intended match, if any.
    pub candidate_span: Option<Span>,
}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48 psess: &'psess ParseSess,
49 mut src: &'src str,
50 mut start_pos: BytePos,
51 override_span: Option<Span>,
52 frontmatter_allowed: FrontmatterAllowed,
53) -> Result<TokenStream, Vec<Diag<'psess>>> {
54 if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
56 src = &src[shebang_len..];
57 start_pos = start_pos + BytePos::from_usize(shebang_len);
58 }
59
60 let cursor = Cursor::new(src, frontmatter_allowed);
61 let mut lexer = Lexer {
62 psess,
63 start_pos,
64 pos: start_pos,
65 src,
66 cursor,
67 override_span,
68 nbsp_is_whitespace: false,
69 last_lifetime: None,
70 token: Token::dummy(),
71 diag_info: TokenTreeDiagInfo::default(),
72 };
73 let res = lexer.lex_token_trees(false);
74
75 let mut unmatched_closing_delims: Vec<_> =
76 make_errors_for_mismatched_closing_delims(&lexer.diag_info.unmatched_delims, psess);
77
78 match res {
79 Ok((_open_spacing, stream)) => {
80 if unmatched_closing_delims.is_empty() {
81 Ok(stream)
82 } else {
83 Err(unmatched_closing_delims)
85 }
86 }
87 Err(errs) => {
88 unmatched_closing_delims.extend(errs);
91 Err(unmatched_closing_delims)
92 }
93 }
94}
95
/// State of the "cooking" lexer: it drives a `rustc_lexer::Cursor` over `src` and
/// turns the raw tokens into `rustc_ast` tokens while emitting diagnostics.
struct Lexer<'psess, 'src> {
    /// Parse session; source of the diagnostic context, lint buffer and the
    /// various span/symbol tables written to while lexing.
    psess: &'psess ParseSess,
    /// Position corresponding to index 0 of `src`; used as the base when
    /// converting a `BytePos` into an index into `src`.
    start_pos: BytePos,
    /// Current position; advanced by the length of every raw token consumed.
    pos: BytePos,
    /// The source text being lexed.
    src: &'src str,
    /// Raw-token cursor over `src`.
    cursor: Cursor<'src>,
    /// When set, every span produced by `mk_sp` is replaced by this span.
    override_span: Option<Span>,
    /// Set once an unknown-token error has been reported for a non-breaking
    /// space (U+00A0); later occurrences are then silently treated as whitespace.
    nbsp_is_whitespace: bool,

    /// Span of the leading `'` of the most recently lexed lifetime, if any.
    /// Consulted by diagnostics that suggest the user meant a `"` string.
    last_lifetime: Option<Span>,

    /// Initialised to `Token::dummy()`; not read in this part of the file.
    /// NOTE(review): presumably the current token of the token-tree lexer — confirm.
    token: Token,

    /// Delimiter bookkeeping for diagnostics; its `unmatched_delims` are turned
    /// into errors once lexing finishes.
    diag_info: TokenTreeDiagInfo,
}
121
122impl<'psess, 'src> Lexer<'psess, 'src> {
123 fn dcx(&self) -> DiagCtxtHandle<'psess> {
124 self.psess.dcx()
125 }
126
127 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
128 self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
129 }
130
    /// Pulls raw tokens from the cursor until one can be turned into a real
    /// ("cooked") token, and returns it.
    ///
    /// Whitespace, non-doc comments, frontmatter and unrecoverable unknown
    /// characters are skipped. The returned `bool` is `true` when at least one
    /// such skipped item preceded the returned token.
    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of upcoming invalid tokens to drop silently because they were
        // already covered by a "repeated character" diagnostic.
        let mut swallow_next_invalid = 0;
        loop {
            // Remember the remaining input so the cursor can be rewound below.
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            // After any of these tokens, forget the last seen lifetime tick so it
            // is no longer offered as the start of a mistyped string literal.
            if let rustc_lexer::TokenKind::Semi
            | rustc_lexer::TokenKind::LineComment { .. }
            | rustc_lexer::TokenKind::BlockComment { .. }
            | rustc_lexer::TokenKind::CloseParen
            | rustc_lexer::TokenKind::CloseBrace
            | rustc_lexer::TokenKind::CloseBracket = token.kind
            {
                self.last_lifetime = None;
            }

            // Convert the raw token kind into a `rustc_ast` token kind, interning
            // text and running extra validation along the way.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Plain comments are linted and then skipped.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // The first three bytes (the doc-comment opener) are not part
                    // of the comment's content.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Plain comments are linted and then skipped.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Content excludes the three-byte opener and, when the comment
                    // is terminated, the two-byte closer.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
                    // Frontmatter is validated, then skipped like whitespace.
                    self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => self.ident(start),
                rustc_lexer::TokenKind::RawIdent => {
                    // Skip the two-byte raw prefix before normalizing the name.
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.psess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.psess.raw_identifier_spans.push(span);
                    token::Ident(sym, IdentIsRaw::Yes)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    self.report_unknown_prefix(start);
                    // The interned name is the full token text starting at `start`.
                    let lifetime_name = self.str_from(start);
                    // Remember the one-byte span at the start of the lifetime.
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                // Recover an invalid identifier as an ordinary identifier, except
                // when it is a single character listed in `UNICODE_ARRAY`; that
                // case falls through to the `Unknown | InvalidIdent` arm below.
                rustc_lexer::TokenKind::InvalidIdent
                    if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
                        let sym = self.str_from(start);
                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                    }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.psess
                        .bad_unicode_identifiers
                        .borrow_mut()
                        .entry(sym)
                        .or_default()
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
                // Before the 2021 edition, C string literals are not recognised:
                // only the prefix is consumed and returned as an identifier.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // Rewind position and cursor to just after the prefix so the
                    // rest of the literal is lexed again on the next call.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    // `suffix_start` arrives relative to the token start.
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        // A lone `_` suffix is rejected and then dropped.
                        if string == "_" {
                            self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
                                span: self.mk_sp(suffix_start, self.pos),
                            });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // The interned name is the full token text starting at `start`.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    if starts_with_number {
                        // Stashed rather than emitted immediately.
                        let span = self.mk_sp(start, self.pos);
                        self.dcx()
                            .struct_err("lifetimes cannot start with a number")
                            .with_span(span)
                            .stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::RawLifetime => {
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));

                    // The name proper begins three bytes into the token.
                    let ident_start = start + BytePos(3);
                    let prefix_span = self.mk_sp(start, ident_start);

                    if prefix_span.at_least_rust_2021() {
                        // A `'` directly after the raw lifetime is reported as a
                        // character literal holding more than one character.
                        if self.cursor.as_str().starts_with('\'') {
                            let lit_span = self.mk_sp(start, self.pos + BytePos(1));
                            let contents = self.str_from_to(start + BytePos(1), self.pos);
                            emit_unescape_error(
                                self.dcx(),
                                contents,
                                lit_span,
                                lit_span,
                                Mode::Char,
                                0..contents.len(),
                                EscapeError::MoreThanOneChar,
                            )
                            .expect("expected error");
                        }

                        let span = self.mk_sp(start, self.pos);

                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }

                        // The interned lifetime name is `'` followed by the name
                        // without the raw prefix.
                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);

                        self.psess.raw_identifier_spans.push(span);

                        token::Lifetime(sym, IdentIsRaw::Yes)
                    } else {
                        // Older editions: lint about the prefix, then treat only
                        // the first two bytes as the lifetime.
                        self.psess.buffer_lint(
                            RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                            prefix_span,
                            ast::CRATE_NODE_ID,
                            BuiltinLintDiag::RawPrefix(prefix_span),
                        );

                        // Rewind so that everything after those two bytes is
                        // lexed again on the next iteration/call.
                        let lt_start = start + BytePos(2);
                        self.pos = lt_start;
                        self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);

                        let lifetime_name = self.str_from(start);
                        let ident = Symbol::intern(lifetime_name);
                        token::Lifetime(ident, IdentIsRaw::No)
                    }
                }
                // Punctuation maps one-to-one.
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenParen,
                rustc_lexer::TokenKind::CloseParen => token::CloseParen,
                rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
                rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
                rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
                rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Bang,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::Minus,
                rustc_lexer::TokenKind::And => token::And,
                rustc_lexer::TokenKind::Or => token::Or,
                rustc_lexer::TokenKind::Plus => token::Plus,
                rustc_lexer::TokenKind::Star => token::Star,
                rustc_lexer::TokenKind::Slash => token::Slash,
                rustc_lexer::TokenKind::Caret => token::Caret,
                rustc_lexer::TokenKind::Percent => token::Percent,

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Part of a run that has already been reported: drop silently.
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // Only the first non-breaking space is reported; later
                        // ones are treated as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    // How many more times the same character follows immediately.
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // May yield a replacement token and/or a suggestion.
                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.dcx().emit_err(errors::UnknownTokenStart {
                        // The span is extended over the repeated characters.
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            // Side effect: the following `repeats` invalid tokens
                            // will be swallowed by the check at the top of this arm.
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    if let Some(token) = token {
                        token
                    } else {
                        // No replacement available: skip the character.
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
457
458 fn ident(&self, start: BytePos) -> TokenKind {
459 let sym = nfc_normalize(self.str_from(start));
460 let span = self.mk_sp(start, self.pos);
461 self.psess.symbol_gallery.insert(sym, span);
462 token::Ident(sym, IdentIsRaw::No)
463 }
464
465 fn lint_unicode_text_flow(&self, start: BytePos) {
468 let content_start = start + BytePos(2);
470 let content = self.str_from(content_start);
471 if contains_text_flow_control_chars(content) {
472 let span = self.mk_sp(start, self.pos);
473 self.psess.buffer_lint(
474 TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
475 span,
476 ast::CRATE_NODE_ID,
477 BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
478 );
479 }
480 }
481
482 fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
483 if contains_text_flow_control_chars(content) {
484 self.report_text_direction_codepoint(
485 content,
486 self.mk_sp(start, self.pos),
487 0,
488 false,
489 "doc comment",
490 );
491 }
492 }
493
494 fn lint_literal_unicode_text_flow(
495 &mut self,
496 text: Symbol,
497 lit_kind: token::LitKind,
498 span: Span,
499 label: &'static str,
500 ) {
501 if !contains_text_flow_control_chars(text.as_str()) {
502 return;
503 }
504 let (padding, point_at_inner_spans) = match lit_kind {
505 token::LitKind::Str | token::LitKind::Char => (1, true),
507 token::LitKind::CStr => (2, true),
509 token::LitKind::StrRaw(n) => (n as u32 + 2, true),
511 token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
513 token::LitKind::Err(_) => return,
515 _ => (0, false),
517 };
518 self.report_text_direction_codepoint(
519 text.as_str(),
520 span,
521 padding,
522 point_at_inner_spans,
523 label,
524 );
525 }
526
527 fn report_text_direction_codepoint(
528 &self,
529 text: &str,
530 span: Span,
531 padding: u32,
532 point_at_inner_spans: bool,
533 label: &str,
534 ) {
535 let spans: Vec<_> = text
537 .char_indices()
538 .filter_map(|(i, c)| {
539 TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
540 let lo = span.lo() + BytePos(i as u32 + padding);
541 (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
542 })
543 })
544 .collect();
545
546 let label = label.to_string();
547 let count = spans.len();
548 let labels = point_at_inner_spans
549 .then_some(errors::HiddenUnicodeCodepointsDiagLabels { spans: spans.clone() });
550 let sub = if point_at_inner_spans && !spans.is_empty() {
551 errors::HiddenUnicodeCodepointsDiagSub::Escape { spans }
552 } else {
553 errors::HiddenUnicodeCodepointsDiagSub::NoEscape { spans }
554 };
555
556 self.psess.buffer_lint(
557 TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
558 span,
559 ast::CRATE_NODE_ID,
560 errors::HiddenUnicodeCodepointsDiag { label, count, span_label: span, labels, sub },
561 );
562 }
563
    /// Validates the frontmatter token spanning `start..self.pos`, emitting an
    /// error for each problem found and feature-gating the frontmatter span.
    ///
    /// `has_invalid_preceding_whitespace` and `invalid_infostring` are flags
    /// computed by the raw lexer; the checks on the closing fence are done here.
    fn validate_frontmatter(
        &self,
        start: BytePos,
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    ) {
        let s = self.str_from(start);
        // Offset of the opening fence: the first `---` in the token text.
        let real_start = s.find("---").unwrap();
        let frontmatter_opening_pos = BytePos(real_start as u32) + start;
        let s_new = &s[real_start..];
        // Everything after the run of dashes that forms the opening fence.
        let within = s_new.trim_start_matches('-');
        let len_opening = s_new.len() - within.len();

        let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
        if has_invalid_preceding_whitespace {
            // Start of the line containing the opening fence (or the token start
            // if no newline precedes it).
            let line_start =
                BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
            let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
            let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
            self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        if invalid_infostring {
            // The reported span covers the rest of the opening line after the fence.
            let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
            let span = self.mk_sp(
                frontmatter_opening_end_pos,
                frontmatter_opening_pos + BytePos(line_end as u32),
            );
            self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
        }

        // The last line of the token is where the closing fence is expected.
        let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
        let last_line = &within[last_line_start..];
        let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
        let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);

        let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
        self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);

        // No closing fence at all: report and skip the remaining checks.
        if !last_line_trimmed.starts_with("---") {
            let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            self.dcx().emit_err(errors::FrontmatterUnclosed {
                span: frontmatter_span,
                note_span: label_span,
            });
            return;
        }

        // Whitespace before the closing fence.
        if last_line_trimmed.len() != last_line.len() {
            let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
            let span = self.mk_sp(last_line_start_pos, line_end);
            let whitespace_end =
                last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
            let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
            self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        // Opening and closing fences must have the same number of dashes.
        let rest = last_line_trimmed.trim_start_matches('-');
        let len_close = last_line_trimmed.len() - rest.len();
        if len_close != len_opening {
            let span = self.mk_sp(frontmatter_opening_pos, self.pos);
            let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
            let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
            self.dcx().emit_err(errors::FrontmatterLengthMismatch {
                span,
                opening,
                close,
                len_opening,
                len_close,
            });
        }

        // Only whitespace may follow the closing fence.
        if !rest.trim_matches(is_whitespace).is_empty() {
            let span = self.mk_sp(last_line_start_pos, self.pos);
            self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
        }
    }
648
649 fn cook_doc_comment(
650 &self,
651 content_start: BytePos,
652 content: &str,
653 comment_kind: CommentKind,
654 doc_style: DocStyle,
655 ) -> TokenKind {
656 if content.contains('\r') {
657 for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
658 let span = self.mk_sp(
659 content_start + BytePos(idx as u32),
660 content_start + BytePos(idx as u32 + 1),
661 );
662 let block = matches!(comment_kind, CommentKind::Block);
663 self.dcx().emit_err(errors::CrDocComment { span, block });
664 }
665 }
666
667 let attr_style = match doc_style {
668 DocStyle::Outer => AttrStyle::Outer,
669 DocStyle::Inner => AttrStyle::Inner,
670 };
671
672 token::DocComment(comment_kind, attr_style, Symbol::intern(content))
673 }
674
    /// Converts a raw literal spanning `start..end` (suffix excluded) into a
    /// cooked literal kind and symbol.
    ///
    /// Unterminated quoted literals and malformed raw strings produce fatal
    /// diagnostics. For quoted literals the trailing numeric arguments passed to
    /// `cook_quoted` are the prefix and postfix lengths in bytes.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    let mut err = self
                        .dcx()
                        .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
                        .with_code(E0762);
                    // If a lifetime tick was seen earlier, suggest that both
                    // ticks were meant to be double quotes.
                    if let Some(lt_sp) = self.last_lifetime {
                        err.multipart_suggestion(
                            "if you meant to write a string literal, use double quotes",
                            vec![
                                (lt_sp, "\"".to_string()),
                                (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
                            ],
                            Applicability::MaybeIncorrect,
                        );
                    }
                    err.emit()
                }
                // `'` ... `'`
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1)
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated byte constant",
                        )
                        .with_code(E0763)
                        .emit()
                }
                // `b'` ... `'`
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1)
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start, end),
                            "unterminated double quote string",
                        )
                        .with_code(E0765)
                        .emit()
                }
                // `"` ... `"`
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1)
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated double quote byte string",
                        )
                        .with_code(E0766)
                        .emit()
                }
                // `b"` ... `"`
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated C string",
                        )
                        .with_code(E0767)
                        .emit()
                }
                // `c"` ... `"`
                self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1)
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                // `n_hashes` is `None` when the raw lexer found the literal invalid.
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    // `r`, n hashes, `"` ... `"`, n hashes
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    // `br`, n hashes, `"` ... `"`, n hashes
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    // `cr`, n hashes, `"` ... `"`, n hashes
                    self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                let mut kind = token::Integer;
                if empty_int {
                    let span = self.mk_sp(start, end);
                    let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
                    kind = token::Err(guar);
                } else if matches!(base, Base::Binary | Base::Octal) {
                    // Check every digit after the two-byte base prefix; each
                    // invalid digit gets its own error.
                    let base = base as u32;
                    let s = self.str_from_to(start + BytePos(2), end);
                    for (idx, c) in s.char_indices() {
                        let span = self.mk_sp(
                            start + BytePos::from_usize(2 + idx),
                            start + BytePos::from_usize(2 + idx + c.len_utf8()),
                        );
                        if c != '_' && c.to_digit(base).is_none() {
                            let guar =
                                self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
                            kind = token::Err(guar);
                        }
                    }
                }
                (kind, self.symbol_from_to(start, end))
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                let mut kind = token::Float;
                if empty_exponent {
                    let span = self.mk_sp(start, self.pos);
                    let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
                    kind = token::Err(guar);
                }
                // Hexadecimal, octal and binary bases are rejected for floats.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    let guar =
                        self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                    kind = token::Err(guar)
                }
                (kind, self.symbol_from_to(start, end))
            }
        }
    }
827
828 #[inline]
829 fn src_index(&self, pos: BytePos) -> usize {
830 (pos - self.start_pos).to_usize()
831 }
832
833 fn str_from(&self, start: BytePos) -> &'src str {
836 self.str_from_to(start, self.pos)
837 }
838
839 fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
841 debug!("taking an ident from {:?} to {:?}", start, end);
842 Symbol::intern(self.str_from_to(start, end))
843 }
844
845 fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
847 &self.src[self.src_index(start)..self.src_index(end)]
848 }
849
850 fn str_from_to_end(&self, start: BytePos) -> &'src str {
852 &self.src[self.src_index(start)..]
853 }
854
855 fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
856 match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
857 Err(RawStrError::InvalidStarter { bad_char }) => {
858 self.report_non_started_raw_string(start, bad_char)
859 }
860 Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
861 .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
862 Err(RawStrError::TooManyDelimiters { found }) => {
863 self.report_too_many_hashes(start, found)
864 }
865 Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
866 }
867 }
868
869 fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
870 self.dcx()
871 .struct_span_fatal(
872 self.mk_sp(start, self.pos),
873 format!(
874 "found invalid character; only `#` is allowed in raw string delimitation: {}",
875 escaped_char(bad_char)
876 ),
877 )
878 .emit()
879 }
880
881 fn report_unterminated_raw_string(
882 &self,
883 start: BytePos,
884 n_hashes: u32,
885 possible_offset: Option<u32>,
886 found_terminators: u32,
887 ) -> ! {
888 let mut err =
889 self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
890 err.code(E0748);
891 err.span_label(self.mk_sp(start, start), "unterminated raw string");
892
893 if n_hashes > 0 {
894 err.note(format!(
895 "this raw string should be terminated with `\"{}`",
896 "#".repeat(n_hashes as usize)
897 ));
898 }
899
900 if let Some(possible_offset) = possible_offset {
901 let lo = start + BytePos(possible_offset);
902 let hi = lo + BytePos(found_terminators);
903 let span = self.mk_sp(lo, hi);
904 err.span_suggestion(
905 span,
906 "consider terminating the string here",
907 "#".repeat(n_hashes as usize),
908 Applicability::MaybeIncorrect,
909 );
910 }
911
912 err.emit()
913 }
914
915 fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
916 let msg = match doc_style {
917 Some(_) => "unterminated block doc-comment",
918 None => "unterminated block comment",
919 };
920 let last_bpos = self.pos;
921 let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
922 err.code(E0758);
923 let mut nested_block_comment_open_idxs = vec![];
924 let mut last_nested_block_comment_idxs = None;
925 let mut content_chars = self.str_from(start).char_indices().peekable();
926
927 while let Some((idx, current_char)) = content_chars.next() {
928 match content_chars.peek() {
929 Some((_, '*')) if current_char == '/' => {
930 nested_block_comment_open_idxs.push(idx);
931 }
932 Some((_, '/')) if current_char == '*' => {
933 last_nested_block_comment_idxs =
934 nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
935 }
936 _ => {}
937 };
938 }
939
940 if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
941 err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
942 .span_label(
943 self.mk_sp(
944 start + BytePos(nested_open_idx as u32),
945 start + BytePos(nested_open_idx as u32 + 2),
946 ),
947 "...as last nested comment starts here, maybe you want to close this instead?",
948 )
949 .span_label(
950 self.mk_sp(
951 start + BytePos(nested_close_idx as u32),
952 start + BytePos(nested_close_idx as u32 + 2),
953 ),
954 "...and last nested comment terminates here.",
955 );
956 }
957
958 err.emit();
959 }
960
961 fn report_unknown_prefix(&self, start: BytePos) {
966 let prefix_span = self.mk_sp(start, self.pos);
967 let prefix = self.str_from_to(start, self.pos);
968 let expn_data = prefix_span.ctxt().outer_expn_data();
969
970 if expn_data.edition.at_least_rust_2021() {
971 let sugg = if prefix == "rb" {
973 Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
974 } else if prefix == "rc" {
975 Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
976 } else if expn_data.is_root() {
977 if self.cursor.first() == '\''
978 && let Some(start) = self.last_lifetime
979 && self.cursor.third() != '\''
980 && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
981 && !self.psess.source_map().is_multiline(start.until(end))
982 {
983 Some(errors::UnknownPrefixSugg::MeantStr { start, end })
987 } else {
988 Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
989 }
990 } else {
991 None
992 };
993 self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
994 } else {
995 self.psess.buffer_lint(
997 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
998 prefix_span,
999 ast::CRATE_NODE_ID,
1000 BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
1001 );
1002 }
1003 }
1004
    /// Handles a guarded-string prefix token that starts at `start`.
    ///
    /// `str_before` is the cursor's remaining input from before the prefix token
    /// was consumed. From the 2024 edition on, an error is emitted and an error
    /// literal token is returned; in earlier editions a lint is buffered, the
    /// lexer is rewound to just after the first `#`, and `Pound` is returned.
    fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
        let span = self.mk_sp(start, self.pos);
        let edition2024 = span.edition().at_least_rust_2024();

        // Empty span right after the first byte; used as the suggestion location.
        let space_pos = start + BytePos(1);
        let space_span = self.mk_sp(space_pos, space_pos);

        // Re-lex from the token start with a scratch cursor to see whether a full
        // guarded string follows.
        let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);

        let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
            Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
                let end = start + BytePos(token_len);
                let span = self.mk_sp(start, end);
                let str_start = start + BytePos(n_hashes);

                // Only in 2024+ is the whole guarded string consumed.
                if edition2024 {
                    self.cursor = cursor;
                    self.pos = end;
                }

                let unterminated = if terminated { None } else { Some(str_start) };

                (true, span, unterminated)
            }
            None => {
                // Not a guarded string; the token is expected to begin with `##`.
                debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

                (false, span, None)
            }
        };
        if edition2024 {
            if let Some(str_start) = unterminated {
                // An unterminated string is a fatal error.
                self.dcx()
                    .struct_span_fatal(
                        self.mk_sp(str_start, self.pos),
                        "unterminated double quote string",
                    )
                    .with_code(E0765)
                    .emit()
            }

            // No suggestion for code that comes from an expansion.
            let sugg = if span.from_expansion() {
                None
            } else {
                Some(errors::GuardedStringSugg(space_span))
            };

            let err = if is_string {
                self.dcx().emit_err(errors::ReservedString { span, sugg })
            } else {
                self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
            };

            token::Literal(token::Lit {
                kind: token::Err(err),
                symbol: self.symbol_from_to(start, self.pos),
                suffix: None,
            })
        } else {
            self.psess.buffer_lint(
                RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
                span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
            );

            // Rewind to just after the first `#` and hand back a plain `Pound`.
            self.pos = start + BytePos(1);
            self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
            token::Pound
        }
    }
1088
1089 fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1090 self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1091 }
1092
1093 fn cook_quoted(
1094 &self,
1095 mut kind: token::LitKind,
1096 mode: Mode,
1097 start: BytePos,
1098 end: BytePos,
1099 prefix_len: u32,
1100 postfix_len: u32,
1101 ) -> (token::LitKind, Symbol) {
1102 let content_start = start + BytePos(prefix_len);
1103 let content_end = end - BytePos(postfix_len);
1104 let lit_content = self.str_from_to(content_start, content_end);
1105 check_for_errors(lit_content, mode, |range, err| {
1106 let span_with_quotes = self.mk_sp(start, end);
1107 let (start, end) = (range.start as u32, range.end as u32);
1108 let lo = content_start + BytePos(start);
1109 let hi = lo + BytePos(end - start);
1110 let span = self.mk_sp(lo, hi);
1111 let is_fatal = err.is_fatal();
1112 if let Some(guar) = emit_unescape_error(
1113 self.dcx(),
1114 lit_content,
1115 span_with_quotes,
1116 span,
1117 mode,
1118 range,
1119 err,
1120 ) {
1121 assert!(is_fatal);
1122 kind = token::Err(guar);
1123 }
1124 });
1125
1126 let sym = if !matches!(kind, token::Err(_)) {
1129 Symbol::intern(lit_content)
1130 } else {
1131 self.symbol_from_to(start, end)
1132 };
1133 (kind, sym)
1134 }
1135}
1136
1137pub fn nfc_normalize(string: &str) -> Symbol {
1138 use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1139 match is_nfc_quick(string.chars()) {
1140 IsNormalized::Yes => Symbol::intern(string),
1141 _ => {
1142 let normalized_str: String = string.chars().nfc().collect();
1143 Symbol::intern(&normalized_str)
1144 }
1145 }
1146}