Skip to content

Commit

Permalink
Fix handling of repeated tuple indexing. (#4733)
Browse files Browse the repository at this point in the history
Per [the
design](https://docs.carbon-lang.dev/docs/design/lexical_conventions/),
`x.1.2` should lex as `(x.1).2`, not as `x.(1.2)`.
  • Loading branch information
zygoloid authored Dec 21, 2024
1 parent 266fd6a commit 28602a8
Show file tree
Hide file tree
Showing 10 changed files with 191 additions and 36 deletions.
17 changes: 16 additions & 1 deletion toolchain/lex/lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ class [[clang::internal_linkage]] Lexer {

auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void;

// Determines whether a real literal can be formed at the current location.
// This is the case unless the preceding token is `.` or `->` and there is no
// intervening whitespace.
auto CanFormRealLiteral() -> bool;

auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
-> LexResult;

Expand Down Expand Up @@ -998,10 +1003,20 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
current_line_info()->indent = position - line_start;
}

auto Lexer::CanFormRealLiteral() -> bool {
// When a numeric literal immediately follows a `.` or `->` token, with no
// intervening whitespace, a real literal is never formed.
if (has_leading_space_) {
return true;
}
auto kind = buffer_.GetKind(buffer_.tokens().end()[-1]);
return kind != TokenKind::Period && kind != TokenKind::MinusGreater;
}

auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
-> LexResult {
std::optional<NumericLiteral> literal =
NumericLiteral::Lex(source_text.substr(position));
NumericLiteral::Lex(source_text.substr(position), CanFormRealLiteral());
if (!literal) {
return LexError(source_text, position);
}
Expand Down
7 changes: 4 additions & 3 deletions toolchain/lex/numeric_literal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

namespace Carbon::Lex {

auto NumericLiteral::Lex(llvm::StringRef source_text)
auto NumericLiteral::Lex(llvm::StringRef source_text,
bool can_form_real_literal)
-> std::optional<NumericLiteral> {
NumericLiteral result;

Expand Down Expand Up @@ -46,8 +47,8 @@ auto NumericLiteral::Lex(llvm::StringRef source_text)

// Exactly one `.` can be part of the literal, but only if it's followed by
// an alphanumeric character.
if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
!seen_radix_point) {
if (c == '.' && can_form_real_literal && i + 1 != n &&
IsAlnum(source_text[i + 1]) && !seen_radix_point) {
result.radix_point_ = i;
seen_radix_point = true;
continue;
Expand Down
3 changes: 2 additions & 1 deletion toolchain/lex/numeric_literal.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class NumericLiteral {
// Extract a numeric literal from the given text, if it has a suitable form.
//
// The supplied `source_text` must outlive the return value.
static auto Lex(llvm::StringRef source_text) -> std::optional<NumericLiteral>;
static auto Lex(llvm::StringRef source_text, bool can_form_real_literal)
-> std::optional<NumericLiteral>;

// Compute the value of the token, if possible. Emit diagnostics to the given
// emitter if the token is not valid.
Expand Down
8 changes: 4 additions & 4 deletions toolchain/lex/numeric_literal_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ namespace {

static void BM_Lex_Float(benchmark::State& state) {
for (auto _ : state) {
CARBON_CHECK(NumericLiteral::Lex("0.000001"));
CARBON_CHECK(NumericLiteral::Lex("0.000001", true));
}
}

static void BM_Lex_Int(benchmark::State& state) {
for (auto _ : state) {
CARBON_CHECK(NumericLiteral::Lex("1_234_567_890"));
CARBON_CHECK(NumericLiteral::Lex("1_234_567_890", true));
}
}

static void BM_ComputeValue_Float(benchmark::State& state) {
auto val = NumericLiteral::Lex("0.000001");
auto val = NumericLiteral::Lex("0.000001", true);
CARBON_CHECK(val);
auto emitter = NullDiagnosticEmitter<const char*>();
for (auto _ : state) {
Expand All @@ -33,7 +33,7 @@ static void BM_ComputeValue_Float(benchmark::State& state) {
}

static void BM_ComputeValue_Int(benchmark::State& state) {
auto val = NumericLiteral::Lex("1_234_567_890");
auto val = NumericLiteral::Lex("1_234_567_890", true);
auto emitter = NullDiagnosticEmitter<const char*>();
CARBON_CHECK(val);
for (auto _ : state) {
Expand Down
2 changes: 1 addition & 1 deletion toolchain/lex/numeric_literal_fuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace Carbon::Testing {
// NOLINTNEXTLINE: Match the documented fuzzer entry point declaration style.
extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data, size_t size) {
auto token = Lex::NumericLiteral::Lex(
llvm::StringRef(reinterpret_cast<const char*>(data), size));
llvm::StringRef(reinterpret_cast<const char*>(data), size), true);
if (!token) {
// Lexically not a numeric literal.
return 0;
Expand Down
102 changes: 78 additions & 24 deletions toolchain/lex/numeric_literal_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,21 @@ class NumericLiteralTest : public ::testing::Test {
public:
NumericLiteralTest() : error_tracker(ConsoleDiagnosticConsumer()) {}

auto Lex(llvm::StringRef text) -> NumericLiteral {
std::optional<NumericLiteral> result = NumericLiteral::Lex(text);
auto Lex(llvm::StringRef text, bool can_form_real_literal) -> NumericLiteral {
std::optional<NumericLiteral> result =
NumericLiteral::Lex(text, can_form_real_literal);
CARBON_CHECK(result);
EXPECT_EQ(result->text(), text);
if (can_form_real_literal) {
EXPECT_EQ(result->text(), text);
}
return *result;
}

auto Parse(llvm::StringRef text) -> NumericLiteral::Value {
auto Parse(llvm::StringRef text, bool can_form_real_literal = true)
-> NumericLiteral::Value {
Testing::SingleTokenDiagnosticConverter converter(text);
DiagnosticEmitter<const char*> emitter(converter, error_tracker);
return Lex(text).ComputeValue(emitter);
return Lex(text, can_form_real_literal).ComputeValue(emitter);
}

ErrorTrackingDiagnosticConsumer error_tracker;
Expand Down Expand Up @@ -91,12 +95,14 @@ TEST_F(NumericLiteralTest, HandlesIntLiteral) {
{.token = "0b10_10_11", .value = 0b10'10'11, .radix = 2},
{.token = "1_234_567", .value = 1'234'567, .radix = 10},
};
for (Testcase testcase : testcases) {
error_tracker.Reset();
EXPECT_THAT(Parse(testcase.token),
HasIntValue(IsUnsignedInt(testcase.value)))
<< testcase.token;
EXPECT_FALSE(error_tracker.seen_error()) << testcase.token;
for (bool can_form_real_literal : {false, true}) {
for (Testcase testcase : testcases) {
error_tracker.Reset();
EXPECT_THAT(Parse(testcase.token, can_form_real_literal),
HasIntValue(IsUnsignedInt(testcase.value)))
<< testcase.token;
EXPECT_FALSE(error_tracker.seen_error()) << testcase.token;
}
}
}

Expand Down Expand Up @@ -190,46 +196,85 @@ TEST_F(NumericLiteralTest, HandlesRealLiteral) {
uint64_t mantissa;
int64_t exponent;
unsigned radix;
uint64_t int_value;
};
Testcase testcases[] = {
// Decimal real literals.
{.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
{.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
{.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
{.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
{.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
{.token = "0.0",
.mantissa = 0,
.exponent = -1,
.radix = 10,
.int_value = 0},
{.token = "12.345",
.mantissa = 12345,
.exponent = -3,
.radix = 10,
.int_value = 12},
{.token = "12.345e6",
.mantissa = 12345,
.exponent = 3,
.radix = 10,
.int_value = 12},
{.token = "12.345e+6",
.mantissa = 12345,
.exponent = 3,
.radix = 10,
.int_value = 12},
{.token = "1_234.5e-2",
.mantissa = 12345,
.exponent = -3,
.radix = 10,
.int_value = 1234},
{.token = "1.0e-2_000_000",
.mantissa = 10,
.exponent = -2'000'001,
.radix = 10},
.radix = 10,
.int_value = 1},

// Hexadecimal real literals.
{.token = "0x1_2345_6789.CDEF",
.mantissa = 0x1'2345'6789'CDEF,
.exponent = -16,
.radix = 16},
{.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
{.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
{.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
.radix = 16,
.int_value = 0x1'2345'6789},
{.token = "0x0.0001p4",
.mantissa = 1,
.exponent = -12,
.radix = 16,
.int_value = 0},
{.token = "0x0.0001p+4",
.mantissa = 1,
.exponent = -12,
.radix = 16,
.int_value = 0},
{.token = "0x0.0001p-4",
.mantissa = 1,
.exponent = -20,
.radix = 16,
.int_value = 0},
// The exponent here works out as exactly INT64_MIN.
{.token = "0x1.01p-9223372036854775800",
.mantissa = 0x101,
.exponent = -9223372036854775807L - 1L,
.radix = 16},
.radix = 16,
.int_value = 1},
// The exponent here doesn't fit in a signed 64-bit integer until we
// adjust for the radix point.
{.token = "0x1.01p9223372036854775809",
.mantissa = 0x101,
.exponent = 9223372036854775801L,
.radix = 16},
.radix = 16,
.int_value = 1},

// Binary real literals. These are invalid, but we accept them for error
// recovery.
{.token = "0b10_11_01.01",
.mantissa = 0b10110101,
.exponent = -2,
.radix = 2},
.radix = 2,
.int_value = 0b101101},
};
// Check we get the right real value.
for (Testcase testcase : testcases) {
error_tracker.Reset();
EXPECT_THAT(Parse(testcase.token),
Expand All @@ -240,6 +285,15 @@ TEST_F(NumericLiteralTest, HandlesRealLiteral) {
EXPECT_EQ(error_tracker.seen_error(), testcase.radix == 2)
<< testcase.token;
}
// If we are required to stop at the `.` character, check we get the right int
// value instead.
for (Testcase testcase : testcases) {
error_tracker.Reset();
EXPECT_THAT(Parse(testcase.token, false),
HasIntValue(IsUnsignedInt(testcase.int_value)))
<< testcase.token;
EXPECT_FALSE(error_tracker.seen_error());
}
}

TEST_F(NumericLiteralTest, HandlesRealLiteralOverflow) {
Expand Down
51 changes: 51 additions & 0 deletions toolchain/lex/testdata/repeated_tuple_index.carbon
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// AUTOUPDATE
// TIP: To test this file alone, run:
// TIP: bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/lex/testdata/repeated_tuple_index.carbon
// TIP: To dump output, run:
// TIP: bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/lex/testdata/repeated_tuple_index.carbon
// CHECK:STDOUT: - filename: repeated_tuple_index.carbon
// CHECK:STDOUT: tokens:

x.0.1.2.3.foo.4.5
// CHECK:STDOUT: - { index: 1, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: 'x', identifier: 0, has_leading_space: true }
// CHECK:STDOUT: - { index: 2, kind: 'Period', line: {{ *}}[[@LINE-2]], column: 2, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 3, kind: 'IntLiteral', line: {{ *}}[[@LINE-3]], column: 3, indent: 1, spelling: '0', value: `0` }
// CHECK:STDOUT: - { index: 4, kind: 'Period', line: {{ *}}[[@LINE-4]], column: 4, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 5, kind: 'IntLiteral', line: {{ *}}[[@LINE-5]], column: 5, indent: 1, spelling: '1', value: `1` }
// CHECK:STDOUT: - { index: 6, kind: 'Period', line: {{ *}}[[@LINE-6]], column: 6, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 7, kind: 'IntLiteral', line: {{ *}}[[@LINE-7]], column: 7, indent: 1, spelling: '2', value: `2` }
// CHECK:STDOUT: - { index: 8, kind: 'Period', line: {{ *}}[[@LINE-8]], column: 8, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 9, kind: 'IntLiteral', line: {{ *}}[[@LINE-9]], column: 9, indent: 1, spelling: '3', value: `3` }
// CHECK:STDOUT: - { index: 10, kind: 'Period', line: {{ *}}[[@LINE-10]], column: 10, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 11, kind: 'Identifier', line: {{ *}}[[@LINE-11]], column: 11, indent: 1, spelling: 'foo', identifier: 1 }
// CHECK:STDOUT: - { index: 12, kind: 'Period', line: {{ *}}[[@LINE-12]], column: 14, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 13, kind: 'IntLiteral', line: {{ *}}[[@LINE-13]], column: 15, indent: 1, spelling: '4', value: `4` }
// CHECK:STDOUT: - { index: 14, kind: 'Period', line: {{ *}}[[@LINE-14]], column: 16, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 15, kind: 'IntLiteral', line: {{ *}}[[@LINE-15]], column: 17, indent: 1, spelling: '5', value: `5` }

1.2.3.4
// CHECK:STDOUT: - { index: 16, kind: 'RealLiteral', line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: '1.2', value: `12*10^-1`, has_leading_space: true }
// CHECK:STDOUT: - { index: 17, kind: 'Period', line: {{ *}}[[@LINE-2]], column: 4, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 18, kind: 'IntLiteral', line: {{ *}}[[@LINE-3]], column: 5, indent: 1, spelling: '3', value: `3` }
// CHECK:STDOUT: - { index: 19, kind: 'Period', line: {{ *}}[[@LINE-4]], column: 6, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 20, kind: 'IntLiteral', line: {{ *}}[[@LINE-5]], column: 7, indent: 1, spelling: '4', value: `4` }

tuple.1.2
// CHECK:STDOUT: - { index: 21, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: 'tuple', identifier: 2, has_leading_space: true }
// CHECK:STDOUT: - { index: 22, kind: 'Period', line: {{ *}}[[@LINE-2]], column: 6, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 23, kind: 'IntLiteral', line: {{ *}}[[@LINE-3]], column: 7, indent: 1, spelling: '1', value: `1` }
// CHECK:STDOUT: - { index: 24, kind: 'Period', line: {{ *}}[[@LINE-4]], column: 8, indent: 1, spelling: '.' }
// CHECK:STDOUT: - { index: 25, kind: 'IntLiteral', line: {{ *}}[[@LINE-5]], column: 9, indent: 1, spelling: '2', value: `2` }

tuple . 1.2
// CHECK:STDOUT: - { index: 26, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: 'tuple', identifier: 2, has_leading_space: true }
// CHECK:STDOUT: - { index: 27, kind: 'Period', line: {{ *}}[[@LINE-2]], column: 7, indent: 1, spelling: '.', has_leading_space: true }
// CHECK:STDOUT: - { index: 28, kind: 'RealLiteral', line: {{ *}}[[@LINE-3]], column: 9, indent: 1, spelling: '1.2', value: `12*10^-1`, has_leading_space: true }

"hello"1.2
// CHECK:STDOUT: - { index: 29, kind: 'StringLiteral', line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: '"hello"', value: `hello`, has_leading_space: true }
// CHECK:STDOUT: - { index: 30, kind: 'RealLiteral', line: {{ *}}[[@LINE-2]], column: 8, indent: 1, spelling: '1.2', value: `12*10^-1` }
3 changes: 2 additions & 1 deletion toolchain/lex/tokenized_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
if (token_info.kind() == TokenKind::IntLiteral ||
token_info.kind() == TokenKind::RealLiteral) {
std::optional<NumericLiteral> relexed_token =
NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()));
NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()),
token_info.kind() == TokenKind::RealLiteral);
CARBON_CHECK(relexed_token, "Could not reform numeric literal token.");
return relexed_token->text();
}
Expand Down
4 changes: 3 additions & 1 deletion toolchain/lex/tokenized_buffer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,9 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
// newline
{.kind = TokenKind::RealLiteral, .text = "9.0"},
{.kind = TokenKind::Period},
{.kind = TokenKind::RealLiteral, .text = "9.5"},
{.kind = TokenKind::IntLiteral, .text = "9"},
{.kind = TokenKind::Period},
{.kind = TokenKind::IntLiteral, .text = "5"},
// newline
{.kind = TokenKind::Error, .text = "10.foo"},
// newline
Expand Down
30 changes: 30 additions & 0 deletions toolchain/parse/testdata/tuple/access/repeated.carbon
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// AUTOUPDATE
// TIP: To test this file alone, run:
// TIP: bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/parse/testdata/tuple/access/repeated.carbon
// TIP: To dump output, run:
// TIP: bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/parse/testdata/tuple/access/repeated.carbon

let x: i32 = tuple.0.1.2;

// CHECK:STDOUT: - filename: repeated.carbon
// CHECK:STDOUT: parse_tree: [
// CHECK:STDOUT: {kind: 'FileStart', text: ''},
// CHECK:STDOUT: {kind: 'LetIntroducer', text: 'let'},
// CHECK:STDOUT: {kind: 'IdentifierName', text: 'x'},
// CHECK:STDOUT: {kind: 'IntTypeLiteral', text: 'i32'},
// CHECK:STDOUT: {kind: 'BindingPattern', text: ':', subtree_size: 3},
// CHECK:STDOUT: {kind: 'LetInitializer', text: '='},
// CHECK:STDOUT: {kind: 'IdentifierNameExpr', text: 'tuple'},
// CHECK:STDOUT: {kind: 'IntLiteral', text: '0'},
// CHECK:STDOUT: {kind: 'MemberAccessExpr', text: '.', subtree_size: 3},
// CHECK:STDOUT: {kind: 'IntLiteral', text: '1'},
// CHECK:STDOUT: {kind: 'MemberAccessExpr', text: '.', subtree_size: 5},
// CHECK:STDOUT: {kind: 'IntLiteral', text: '2'},
// CHECK:STDOUT: {kind: 'MemberAccessExpr', text: '.', subtree_size: 7},
// CHECK:STDOUT: {kind: 'LetDecl', text: ';', subtree_size: 13},
// CHECK:STDOUT: {kind: 'FileEnd', text: ''},
// CHECK:STDOUT: ]

0 comments on commit 28602a8

Please sign in to comment.