Fix handling of repeated tuple indexing. (#4733)

Per [the design](https://docs.carbon-lang.dev/docs/design/lexical_conventions/), `x.1.2` should lex as `(x.1).2`, not as `x.(1.2)`.
carbon-language · Dec 21, 2024 · 28602a8 · 28602a8
1 parent 266fd6a
commit 28602a8
Show file tree

Hide file tree

Showing 10 changed files with 191 additions and 36 deletions.
diff --git a/toolchain/lex/lex.cpp b/toolchain/lex/lex.cpp
@@ -157,6 +157,11 @@ class [[clang::internal_linkage]] Lexer {
 
   auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void;
 
+  // Determines whether a real literal can be formed at the current location.
+  // This is the case unless the preceding token is `.` or `->` and there is no
+  // intervening whitespace.
+  auto CanFormRealLiteral() -> bool;
+
   auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
       -> LexResult;
 
@@ -998,10 +1003,20 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
   current_line_info()->indent = position - line_start;
 }
 
+auto Lexer::CanFormRealLiteral() -> bool {
+  // When a numeric literal immediately follows a `.` or `->` token, with no
+  // intervening whitespace, a real literal is never formed.
+  if (has_leading_space_) {
+    return true;
+  }
+  auto kind = buffer_.GetKind(buffer_.tokens().end()[-1]);
+  return kind != TokenKind::Period && kind != TokenKind::MinusGreater;
+}
+
 auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
     -> LexResult {
   std::optional<NumericLiteral> literal =
-      NumericLiteral::Lex(source_text.substr(position));
+      NumericLiteral::Lex(source_text.substr(position), CanFormRealLiteral());
   if (!literal) {
     return LexError(source_text, position);
   }

diff --git a/toolchain/lex/numeric_literal.cpp b/toolchain/lex/numeric_literal.cpp
@@ -15,7 +15,8 @@
 
 namespace Carbon::Lex {
 
-auto NumericLiteral::Lex(llvm::StringRef source_text)
+auto NumericLiteral::Lex(llvm::StringRef source_text,
+                         bool can_form_real_literal)
     -> std::optional<NumericLiteral> {
   NumericLiteral result;
 
@@ -46,8 +47,8 @@ auto NumericLiteral::Lex(llvm::StringRef source_text)
 
     // Exactly one `.` can be part of the literal, but only if it's followed by
     // an alphanumeric character.
-    if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
-        !seen_radix_point) {
+    if (c == '.' && can_form_real_literal && i + 1 != n &&
+        IsAlnum(source_text[i + 1]) && !seen_radix_point) {
       result.radix_point_ = i;
       seen_radix_point = true;
       continue;

diff --git a/toolchain/lex/numeric_literal.h b/toolchain/lex/numeric_literal.h
@@ -42,7 +42,8 @@ class NumericLiteral {
   // Extract a numeric literal from the given text, if it has a suitable form.
   //
   // The supplied `source_text` must outlive the return value.
-  static auto Lex(llvm::StringRef source_text) -> std::optional<NumericLiteral>;
+  static auto Lex(llvm::StringRef source_text, bool can_form_real_literal)
+      -> std::optional<NumericLiteral>;
 
   // Compute the value of the token, if possible. Emit diagnostics to the given
   // emitter if the token is not valid.

diff --git a/toolchain/lex/numeric_literal_benchmark.cpp b/toolchain/lex/numeric_literal_benchmark.cpp
@@ -13,18 +13,18 @@ namespace {
 
 static void BM_Lex_Float(benchmark::State& state) {
   for (auto _ : state) {
-    CARBON_CHECK(NumericLiteral::Lex("0.000001"));
+    CARBON_CHECK(NumericLiteral::Lex("0.000001", true));
   }
 }
 
 static void BM_Lex_Int(benchmark::State& state) {
   for (auto _ : state) {
-    CARBON_CHECK(NumericLiteral::Lex("1_234_567_890"));
+    CARBON_CHECK(NumericLiteral::Lex("1_234_567_890", true));
   }
 }
 
 static void BM_ComputeValue_Float(benchmark::State& state) {
-  auto val = NumericLiteral::Lex("0.000001");
+  auto val = NumericLiteral::Lex("0.000001", true);
   CARBON_CHECK(val);
   auto emitter = NullDiagnosticEmitter<const char*>();
   for (auto _ : state) {
@@ -33,7 +33,7 @@ static void BM_ComputeValue_Float(benchmark::State& state) {
 }
 
 static void BM_ComputeValue_Int(benchmark::State& state) {
-  auto val = NumericLiteral::Lex("1_234_567_890");
+  auto val = NumericLiteral::Lex("1_234_567_890", true);
   auto emitter = NullDiagnosticEmitter<const char*>();
   CARBON_CHECK(val);
   for (auto _ : state) {

diff --git a/toolchain/lex/numeric_literal_fuzzer.cpp b/toolchain/lex/numeric_literal_fuzzer.cpp
@@ -14,7 +14,7 @@ namespace Carbon::Testing {
 // NOLINTNEXTLINE: Match the documented fuzzer entry point declaration style.
 extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data, size_t size) {
   auto token = Lex::NumericLiteral::Lex(
-      llvm::StringRef(reinterpret_cast<const char*>(data), size));
+      llvm::StringRef(reinterpret_cast<const char*>(data), size), true);
   if (!token) {
     // Lexically not a numeric literal.
     return 0;

diff --git a/toolchain/lex/numeric_literal_test.cpp b/toolchain/lex/numeric_literal_test.cpp
@@ -25,17 +25,21 @@ class NumericLiteralTest : public ::testing::Test {
  public:
   NumericLiteralTest() : error_tracker(ConsoleDiagnosticConsumer()) {}
 
-  auto Lex(llvm::StringRef text) -> NumericLiteral {
-    std::optional<NumericLiteral> result = NumericLiteral::Lex(text);
+  auto Lex(llvm::StringRef text, bool can_form_real_literal) -> NumericLiteral {
+    std::optional<NumericLiteral> result =
+        NumericLiteral::Lex(text, can_form_real_literal);
     CARBON_CHECK(result);
-    EXPECT_EQ(result->text(), text);
+    if (can_form_real_literal) {
+      EXPECT_EQ(result->text(), text);
+    }
     return *result;
   }
 
-  auto Parse(llvm::StringRef text) -> NumericLiteral::Value {
+  auto Parse(llvm::StringRef text, bool can_form_real_literal = true)
+      -> NumericLiteral::Value {
     Testing::SingleTokenDiagnosticConverter converter(text);
     DiagnosticEmitter<const char*> emitter(converter, error_tracker);
-    return Lex(text).ComputeValue(emitter);
+    return Lex(text, can_form_real_literal).ComputeValue(emitter);
   }
 
   ErrorTrackingDiagnosticConsumer error_tracker;
@@ -91,12 +95,14 @@ TEST_F(NumericLiteralTest, HandlesIntLiteral) {
       {.token = "0b10_10_11", .value = 0b10'10'11, .radix = 2},
       {.token = "1_234_567", .value = 1'234'567, .radix = 10},
   };
-  for (Testcase testcase : testcases) {
-    error_tracker.Reset();
-    EXPECT_THAT(Parse(testcase.token),
-                HasIntValue(IsUnsignedInt(testcase.value)))
-        << testcase.token;
-    EXPECT_FALSE(error_tracker.seen_error()) << testcase.token;
+  for (bool can_form_real_literal : {false, true}) {
+    for (Testcase testcase : testcases) {
+      error_tracker.Reset();
+      EXPECT_THAT(Parse(testcase.token, can_form_real_literal),
+                  HasIntValue(IsUnsignedInt(testcase.value)))
+          << testcase.token;
+      EXPECT_FALSE(error_tracker.seen_error()) << testcase.token;
+    }
   }
 }
 
@@ -190,46 +196,85 @@ TEST_F(NumericLiteralTest, HandlesRealLiteral) {
     uint64_t mantissa;
     int64_t exponent;
     unsigned radix;
+    uint64_t int_value;
   };
   Testcase testcases[] = {
       // Decimal real literals.
-      {.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
-      {.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
-      {.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
-      {.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
-      {.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
+      {.token = "0.0",
+       .mantissa = 0,
+       .exponent = -1,
+       .radix = 10,
+       .int_value = 0},
+      {.token = "12.345",
+       .mantissa = 12345,
+       .exponent = -3,
+       .radix = 10,
+       .int_value = 12},
+      {.token = "12.345e6",
+       .mantissa = 12345,
+       .exponent = 3,
+       .radix = 10,
+       .int_value = 12},
+      {.token = "12.345e+6",
+       .mantissa = 12345,
+       .exponent = 3,
+       .radix = 10,
+       .int_value = 12},
+      {.token = "1_234.5e-2",
+       .mantissa = 12345,
+       .exponent = -3,
+       .radix = 10,
+       .int_value = 1234},
       {.token = "1.0e-2_000_000",
        .mantissa = 10,
        .exponent = -2'000'001,
-       .radix = 10},
+       .radix = 10,
+       .int_value = 1},
 
       // Hexadecimal real literals.
       {.token = "0x1_2345_6789.CDEF",
        .mantissa = 0x1'2345'6789'CDEF,
        .exponent = -16,
-       .radix = 16},
-      {.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
-      {.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
-      {.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
+       .radix = 16,
+       .int_value = 0x1'2345'6789},
+      {.token = "0x0.0001p4",
+       .mantissa = 1,
+       .exponent = -12,
+       .radix = 16,
+       .int_value = 0},
+      {.token = "0x0.0001p+4",
+       .mantissa = 1,
+       .exponent = -12,
+       .radix = 16,
+       .int_value = 0},
+      {.token = "0x0.0001p-4",
+       .mantissa = 1,
+       .exponent = -20,
+       .radix = 16,
+       .int_value = 0},
       // The exponent here works out as exactly INT64_MIN.
       {.token = "0x1.01p-9223372036854775800",
        .mantissa = 0x101,
        .exponent = -9223372036854775807L - 1L,
-       .radix = 16},
+       .radix = 16,
+       .int_value = 1},
       // The exponent here doesn't fit in a signed 64-bit integer until we
       // adjust for the radix point.
       {.token = "0x1.01p9223372036854775809",
        .mantissa = 0x101,
        .exponent = 9223372036854775801L,
-       .radix = 16},
+       .radix = 16,
+       .int_value = 1},
 
       // Binary real literals. These are invalid, but we accept them for error
       // recovery.
       {.token = "0b10_11_01.01",
        .mantissa = 0b10110101,
        .exponent = -2,
-       .radix = 2},
+       .radix = 2,
+       .int_value = 0b101101},
   };
+  // Check we get the right real value.
   for (Testcase testcase : testcases) {
     error_tracker.Reset();
     EXPECT_THAT(Parse(testcase.token),
@@ -240,6 +285,15 @@ TEST_F(NumericLiteralTest, HandlesRealLiteral) {
     EXPECT_EQ(error_tracker.seen_error(), testcase.radix == 2)
         << testcase.token;
   }
+  // If we are required to stop at the `.` character, check we get the right int
+  // value instead.
+  for (Testcase testcase : testcases) {
+    error_tracker.Reset();
+    EXPECT_THAT(Parse(testcase.token, false),
+                HasIntValue(IsUnsignedInt(testcase.int_value)))
+        << testcase.token;
+    EXPECT_FALSE(error_tracker.seen_error());
+  }
 }
 
 TEST_F(NumericLiteralTest, HandlesRealLiteralOverflow) {

diff --git a/toolchain/lex/testdata/repeated_tuple_index.carbon b/toolchain/lex/testdata/repeated_tuple_index.carbon
@@ -0,0 +1,51 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// TIP: To test this file alone, run:
+// TIP:   bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/lex/testdata/repeated_tuple_index.carbon
+// TIP: To dump output, run:
+// TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/lex/testdata/repeated_tuple_index.carbon
+// CHECK:STDOUT: - filename: repeated_tuple_index.carbon
+// CHECK:STDOUT:   tokens:
+
+x.0.1.2.3.foo.4.5
+// CHECK:STDOUT:   - { index:  1, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: 'x', identifier: 0, has_leading_space: true }
+// CHECK:STDOUT:   - { index:  2, kind:        'Period', line: {{ *}}[[@LINE-2]], column:   2, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index:  3, kind:    'IntLiteral', line: {{ *}}[[@LINE-3]], column:   3, indent: 1, spelling: '0', value: `0` }
+// CHECK:STDOUT:   - { index:  4, kind:        'Period', line: {{ *}}[[@LINE-4]], column:   4, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index:  5, kind:    'IntLiteral', line: {{ *}}[[@LINE-5]], column:   5, indent: 1, spelling: '1', value: `1` }
+// CHECK:STDOUT:   - { index:  6, kind:        'Period', line: {{ *}}[[@LINE-6]], column:   6, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index:  7, kind:    'IntLiteral', line: {{ *}}[[@LINE-7]], column:   7, indent: 1, spelling: '2', value: `2` }
+// CHECK:STDOUT:   - { index:  8, kind:        'Period', line: {{ *}}[[@LINE-8]], column:   8, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index:  9, kind:    'IntLiteral', line: {{ *}}[[@LINE-9]], column:   9, indent: 1, spelling: '3', value: `3` }
+// CHECK:STDOUT:   - { index: 10, kind:        'Period', line: {{ *}}[[@LINE-10]], column:  10, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 11, kind:    'Identifier', line: {{ *}}[[@LINE-11]], column:  11, indent: 1, spelling: 'foo', identifier: 1 }
+// CHECK:STDOUT:   - { index: 12, kind:        'Period', line: {{ *}}[[@LINE-12]], column:  14, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 13, kind:    'IntLiteral', line: {{ *}}[[@LINE-13]], column:  15, indent: 1, spelling: '4', value: `4` }
+// CHECK:STDOUT:   - { index: 14, kind:        'Period', line: {{ *}}[[@LINE-14]], column:  16, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 15, kind:    'IntLiteral', line: {{ *}}[[@LINE-15]], column:  17, indent: 1, spelling: '5', value: `5` }
+
+1.2.3.4
+// CHECK:STDOUT:   - { index: 16, kind:   'RealLiteral', line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: '1.2', value: `12*10^-1`, has_leading_space: true }
+// CHECK:STDOUT:   - { index: 17, kind:        'Period', line: {{ *}}[[@LINE-2]], column:   4, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 18, kind:    'IntLiteral', line: {{ *}}[[@LINE-3]], column:   5, indent: 1, spelling: '3', value: `3` }
+// CHECK:STDOUT:   - { index: 19, kind:        'Period', line: {{ *}}[[@LINE-4]], column:   6, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 20, kind:    'IntLiteral', line: {{ *}}[[@LINE-5]], column:   7, indent: 1, spelling: '4', value: `4` }
+
+tuple.1.2
+// CHECK:STDOUT:   - { index: 21, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: 'tuple', identifier: 2, has_leading_space: true }
+// CHECK:STDOUT:   - { index: 22, kind:        'Period', line: {{ *}}[[@LINE-2]], column:   6, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 23, kind:    'IntLiteral', line: {{ *}}[[@LINE-3]], column:   7, indent: 1, spelling: '1', value: `1` }
+// CHECK:STDOUT:   - { index: 24, kind:        'Period', line: {{ *}}[[@LINE-4]], column:   8, indent: 1, spelling: '.' }
+// CHECK:STDOUT:   - { index: 25, kind:    'IntLiteral', line: {{ *}}[[@LINE-5]], column:   9, indent: 1, spelling: '2', value: `2` }
+
+tuple . 1.2
+// CHECK:STDOUT:   - { index: 26, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: 'tuple', identifier: 2, has_leading_space: true }
+// CHECK:STDOUT:   - { index: 27, kind:        'Period', line: {{ *}}[[@LINE-2]], column:   7, indent: 1, spelling: '.', has_leading_space: true }
+// CHECK:STDOUT:   - { index: 28, kind:   'RealLiteral', line: {{ *}}[[@LINE-3]], column:   9, indent: 1, spelling: '1.2', value: `12*10^-1`, has_leading_space: true }
+
+"hello"1.2
+// CHECK:STDOUT:   - { index: 29, kind: 'StringLiteral', line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: '"hello"', value: `hello`, has_leading_space: true }
+// CHECK:STDOUT:   - { index: 30, kind:   'RealLiteral', line: {{ *}}[[@LINE-2]], column:   8, indent: 1, spelling: '1.2', value: `12*10^-1` }
diff --git a/toolchain/lex/tokenized_buffer.cpp b/toolchain/lex/tokenized_buffer.cpp
@@ -70,7 +70,8 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
   if (token_info.kind() == TokenKind::IntLiteral ||
       token_info.kind() == TokenKind::RealLiteral) {
     std::optional<NumericLiteral> relexed_token =
-        NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()));
+        NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()),
+                            token_info.kind() == TokenKind::RealLiteral);
     CARBON_CHECK(relexed_token, "Could not reform numeric literal token.");
     return relexed_token->text();
   }

diff --git a/toolchain/lex/tokenized_buffer_test.cpp b/toolchain/lex/tokenized_buffer_test.cpp
@@ -337,7 +337,9 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
                           // newline
                           {.kind = TokenKind::RealLiteral, .text = "9.0"},
                           {.kind = TokenKind::Period},
-                          {.kind = TokenKind::RealLiteral, .text = "9.5"},
+                          {.kind = TokenKind::IntLiteral, .text = "9"},
+                          {.kind = TokenKind::Period},
+                          {.kind = TokenKind::IntLiteral, .text = "5"},
                           // newline
                           {.kind = TokenKind::Error, .text = "10.foo"},
                           // newline

diff --git a/toolchain/parse/testdata/tuple/access/repeated.carbon b/toolchain/parse/testdata/tuple/access/repeated.carbon
@@ -0,0 +1,30 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// TIP: To test this file alone, run:
+// TIP:   bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/parse/testdata/tuple/access/repeated.carbon
+// TIP: To dump output, run:
+// TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/parse/testdata/tuple/access/repeated.carbon
+
+let x: i32 = tuple.0.1.2;
+
+// CHECK:STDOUT: - filename: repeated.carbon
+// CHECK:STDOUT:   parse_tree: [
+// CHECK:STDOUT:     {kind: 'FileStart', text: ''},
+// CHECK:STDOUT:       {kind: 'LetIntroducer', text: 'let'},
+// CHECK:STDOUT:         {kind: 'IdentifierName', text: 'x'},
+// CHECK:STDOUT:         {kind: 'IntTypeLiteral', text: 'i32'},
+// CHECK:STDOUT:       {kind: 'BindingPattern', text: ':', subtree_size: 3},
+// CHECK:STDOUT:       {kind: 'LetInitializer', text: '='},
+// CHECK:STDOUT:             {kind: 'IdentifierNameExpr', text: 'tuple'},
+// CHECK:STDOUT:             {kind: 'IntLiteral', text: '0'},
+// CHECK:STDOUT:           {kind: 'MemberAccessExpr', text: '.', subtree_size: 3},
+// CHECK:STDOUT:           {kind: 'IntLiteral', text: '1'},
+// CHECK:STDOUT:         {kind: 'MemberAccessExpr', text: '.', subtree_size: 5},
+// CHECK:STDOUT:         {kind: 'IntLiteral', text: '2'},
+// CHECK:STDOUT:       {kind: 'MemberAccessExpr', text: '.', subtree_size: 7},
+// CHECK:STDOUT:     {kind: 'LetDecl', text: ';', subtree_size: 13},
+// CHECK:STDOUT:     {kind: 'FileEnd', text: ''},
+// CHECK:STDOUT:   ]