Merge string literals that are adjacent.

Liareth · Liareth · commit 5a60906144e8 · 2021-04-18T15:29:53.000+01:00
This has the side effect of removing one of our passes (since the names buffer is now filled inside the main loop), improving performance by about 5%.
diff --git a/src/nwtrees/Lexer.cpp b/src/nwtrees/Lexer.cpp
@@ -484,20 +484,13 @@ LexerOutput nwtrees::lexer(const char* data, LexerOutput&& prev_output)
             std::stable_sort(matches, matches + match_count, &cmp);
         }
 
-        // -- Commit token.
-
         LexerMatch& selected_match = matches[0];
-        output.tokens.push_back(std::move(selected_match.token));
-
-        // -- Step stream forward, past the matched token length.
+        bool should_commit_match = true;
 
-        input.offset += selected_match.length;
-    }
+        // -- For tokens that need name buffers, prepare the buffer and update the name entry.
 
-    // -- For tokens that need name buffers, prepare the buffer and update the name entry.
+        Token& token = selected_match.token;
 
-    for (Token& token : output.tokens)
-    {
         const bool is_identifier = token.type == Token::Identifier;
         const bool is_str_literal = token.type == Token::Literal && token.literal == Literal::String;
 
@@ -509,11 +502,30 @@ LexerOutput nwtrees::lexer(const char* data, LexerOutput&& prev_output)
             std::memcpy(output.names.data() + new_idx, input.base + entry->idx, entry->len);
             entry->idx = (int)new_idx;
         }
-    }
 
-    // -- We will check for any string literals that are together and merge them into one token.
-    // This is quite easy: because they are next to each other, their contents are guaranteed to be next
-    // to each other in the buffer, so we just delete the ones at the end and increase the length of the first.
+        // -- If we're a string literal, we merge ourself with the previous token if it was also a string literal.
+
+        if (is_str_literal && !output.tokens.empty())
+        {
+            Token& last_token = output.tokens[output.tokens.size() - 1];
+            if (last_token.type == Token::Literal && last_token.literal == Literal::String)
+            {
+                last_token.literal_data.str.len += token.literal_data.str.len;
+                should_commit_match = false;
+            }
+        }
+
+        // -- Commit token.
+
+        if (should_commit_match)
+        {
+            output.tokens.push_back(token);
+        }
+
+        // -- Step stream forward, past the matched token length.
+
+        input.offset += selected_match.length;
+    }
 
     return output;
 }
diff --git a/tests/Lexer.cpp b/tests/Lexer.cpp
@@ -5,13 +5,13 @@
 namespace
 {
     template <typename T>
-    std::string concat(const T& collection)
+    std::string concat(const T& collection, const char separator = ' ')
     {
         std::string ret;
         for (const char* str : collection)
         {
             ret += str;
-            ret += ' ';
+            ret += separator;
         }
         return ret;
     }
@@ -99,11 +99,11 @@ TEST_CLASS(Lexer)
     TEST_METHOD(Literals_String)
     {
         static constexpr std::array literals { R"("test \" ")", R"("testnewline\n")" };
-        nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals).c_str());
-        TEST_EXPECT(lex.tokens.size() == literals.size());
+        nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals, ';').c_str());
+        TEST_EXPECT(lex.tokens.size() == literals.size() * 2);
         TEST_EXPECT(lex.errors.empty());
 
-        for (int i = 0; i < literals.size(); ++i)
+        for (int i = 0; i < literals.size(); i += 2)
         {
             const nwtrees::Token& token = lex.tokens[i];
             TEST_EXPECT(token.type == nwtrees::Token::Literal);
@@ -115,9 +115,8 @@ TEST_CLASS(Lexer)
 
     TEST_METHOD(Literals_String_Concat)
     {
-        return;
-        const char* input = R"("test" "test2" "test3")";
-        nwtrees::LexerOutput lex = nwtrees::lexer(input);
+        static constexpr std::array literals { R"("test")", R"("test2")", R"("test3")" };
+        nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals).c_str());
         TEST_EXPECT(lex.tokens.size() == 1);
 
         const nwtrees::Token& token = lex.tokens[0];

Original file line number	Diff line number	Diff line change
`@@ -5,13 +5,13 @@`
`5`	`5`	`namespace`
`6`	`6`	`{`
`7`	`7`	`template <typename T>`
`8`		`- std::string concat(const T& collection)`
	`8`	`+ std::string concat(const T& collection, const char separator = ' ')`
`9`	`9`	`{`
`10`	`10`	`std::string ret;`
`11`	`11`	`for (const char* str : collection)`
`12`	`12`	`{`
`13`	`13`	`ret += str;`
`14`		`- ret += ' ';`
	`14`	`+ ret += separator;`
`15`	`15`	`}`
`16`	`16`	`return ret;`
`17`	`17`	`}`
`@@ -99,11 +99,11 @@ TEST_CLASS(Lexer)`
`99`	`99`	`TEST_METHOD(Literals_String)`
`100`	`100`	`{`
`101`	`101`	`static constexpr std::array literals { R"("test \" ")", R"("testnewline\n")" };`
`102`		`- nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals).c_str());`
`103`		`- TEST_EXPECT(lex.tokens.size() == literals.size());`
	`102`	`+ nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals, ';').c_str());`
	`103`	`+ TEST_EXPECT(lex.tokens.size() == literals.size() * 2);`
`104`	`104`	`TEST_EXPECT(lex.errors.empty());`
`105`	`105`
`106`		`- for (int i = 0; i < literals.size(); ++i)`
	`106`	`+ for (int i = 0; i < literals.size(); i += 2)`
`107`	`107`	`{`
`108`	`108`	`const nwtrees::Token& token = lex.tokens[i];`
`109`	`109`	`TEST_EXPECT(token.type == nwtrees::Token::Literal);`
`@@ -115,9 +115,8 @@ TEST_CLASS(Lexer)`
`115`	`115`
`116`	`116`	`TEST_METHOD(Literals_String_Concat)`
`117`	`117`	`{`
`118`		`- return;`
`119`		`- const char* input = R"("test" "test2" "test3")";`
`120`		`- nwtrees::LexerOutput lex = nwtrees::lexer(input);`
	`118`	`+ static constexpr std::array literals { R"("test")", R"("test2")", R"("test3")" };`
	`119`	`+ nwtrees::LexerOutput lex = nwtrees::lexer(concat(literals).c_str());`
`121`	`120`	`TEST_EXPECT(lex.tokens.size() == 1);`
`122`	`121`
`123`	`122`	`const nwtrees::Token& token = lex.tokens[0];`