Increase test-coverage of lexer

It turns out that an unterminated string caused an infinite loop; that has been fixed, along with a test-case or two to confirm the fix. Some other things were fixed up as well, so we now have 100% test-coverage of this package, albeit some of it is fake.
skx · Nov 21, 2023 · 9d572c7 · 9d572c7
1 parent 77f5afc
commit 9d572c7
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 28 deletions.
diff --git a/lexer/lexer.go b/lexer/lexer.go
@@ -3,6 +3,7 @@
 package lexer
 
 import (
+	"errors"
 	"fmt"
 	"strings"
 	"unicode"
@@ -237,11 +238,23 @@ func (l *Lexer) NextToken() token.Token {
 			}
 		}
 	case rune('"'):
-		tok.Type = token.STRING
-		tok.Literal = l.readString()
+		str, err := l.readString('"')
+		if err == nil {
+			tok.Literal = str
+			tok.Type = token.STRING
+		} else {
+			tok.Literal = err.Error()
+			tok.Type = token.ILLEGAL
+		}
 	case rune('`'):
-		tok.Type = token.BACKTICK
-		tok.Literal = l.readBacktick()
+		str, err := l.readString('`')
+		if err == nil {
+			tok.Literal = str
+			tok.Type = token.BACKTICK
+		} else {
+			tok.Literal = err.Error()
+			tok.Type = token.ILLEGAL
+		}
 	case rune('['):
 		tok = newToken(token.LBRACKET, l.ch)
 	case rune(']'):
@@ -284,16 +297,16 @@ func newToken(tokenType token.Type, ch rune) token.Token {
 //
 // So with input like this:
 //
-//   a.blah();
+//	a.blah();
 //
 // Our identifier should be "a" (then we have a period, then a second
 // identifier "blah", followed by opening & closing parenthesis).
 //
 // However we also have to cover the case of:
 //
-//    string.toupper( "blah" );
-//    os.getenv( "PATH" );
-//    ..
+//	string.toupper( "blah" );
+//	os.getenv( "PATH" );
+//	..
 //
 // So we have a horrid implementation..
 func (l *Lexer) readIdentifier() string {
@@ -488,22 +501,36 @@ func (l *Lexer) readDecimal() token.Token {
 	return token.Token{Type: token.INT, Literal: integer}
 }
 
-// read string
-func (l *Lexer) readString() string {
+// read a string, delimited by the given character.
+func (l *Lexer) readString(delim rune) (string, error) {
 	out := ""
 
 	for {
 		l.readChar()
-		if l.ch == '"' {
+
+		if l.ch == rune(0) {
+			return "", fmt.Errorf("unterminated string")
+		}
+		if l.ch == delim {
 			break
 		}
-
 		//
 		// Handle \n, \r, \t, \", etc.
 		//
 		if l.ch == '\\' {
+
+			// Line ending with "\" + newline
+			if l.peekChar() == '\n' {
+				// consume the newline.
+				l.readChar()
+				continue
+			}
+
 			l.readChar()
 
+			if l.ch == rune(0) {
+				return "", errors.New("unterminated string")
+			}
 			if l.ch == rune('n') {
 				l.ch = '\n'
 			}
@@ -521,9 +548,10 @@ func (l *Lexer) readString() string {
 			}
 		}
 		out = out + string(l.ch)
+
 	}
 
-	return out
+	return out, nil
 }
 
 // read a regexp, including flags.
@@ -576,19 +604,6 @@ func (l *Lexer) readRegexp() (string, error) {
 	return out, nil
 }
 
-// read the end of a backtick-quoted string
-func (l *Lexer) readBacktick() string {
-	position := l.position + 1
-	for {
-		l.readChar()
-		if l.ch == '`' {
-			break
-		}
-	}
-	out := string(l.characters[position:l.position])
-	return out
-}
-
 // peek character
 func (l *Lexer) peekChar() rune {
 	if l.readPosition >= len(l.characters) {

diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
@@ -244,15 +244,17 @@ func TestString(t *testing.T) {
 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
 		}
 	}
-
 }
+
 func TestSimpleComment(t *testing.T) {
 	input := `=+// This is a comment
 // This is still a comment
 # I like comments
 let a = 1; # This is a comment too.
 // This is a final
-// comment on two-lines`
+// comment on two-lines
+/*
+`
 
 	tests := []struct {
 		expectedType    token.Type
@@ -673,6 +675,11 @@ a = 3/4;
 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
 		}
 	}
+
+	x := l.GetLine()
+	if x != 2 {
+		t.Fatalf("unexpected line. %d", x)
+	}
 }
 
 // TestDotDot is designed to ensure we get a ".." not an integer value.
@@ -702,3 +709,71 @@ func TestDotDot(t *testing.T) {
 		}
 	}
 }
+
+// TestIllegalString is designed to look for an unterminated/illegal string
+func TestIllegalString(t *testing.T) {
+
+	// Illegal strings
+	bad := []string{
+		`if ( f ~= "steve\
+ )`,
+		`if ( f ~= "steve\`,
+	}
+
+	for _, input := range bad {
+
+		tests := []struct {
+			expectedType    token.Type
+			expectedLiteral string
+		}{
+			{token.IF, "if"},
+			{token.LPAREN, "("},
+			{token.IDENT, "f"},
+			{token.CONTAINS, "~="},
+			{token.ILLEGAL, "unterminated string"},
+			{token.EOF, ""},
+		}
+		l := New(input)
+		for i, tt := range tests {
+			tok := l.NextToken()
+			if tok.Type != tt.expectedType {
+				t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
+			}
+			if tok.Literal != tt.expectedLiteral {
+				t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
+			}
+		}
+	}
+}
+
+// TestIllegalBacktick is designed to look for an unterminated/illegal backtick
+func TestIllegalBacktick(t *testing.T) {
+	input := "if ( f ~= `steve )"
+
+	tests := []struct {
+		expectedType    token.Type
+		expectedLiteral string
+	}{
+		{token.IF, "if"},
+		{token.LPAREN, "("},
+		{token.IDENT, "f"},
+		{token.CONTAINS, "~="},
+		{token.ILLEGAL, "unterminated string"},
+		{token.EOF, ""},
+	}
+	l := New(input)
+	for i, tt := range tests {
+		tok := l.NextToken()
+		if tok.Type != tt.expectedType {
+			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
+		}
+		if tok.Literal != tt.expectedLiteral {
+			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
+		}
+	}
+
+	x := l.GetLine()
+	if x != 0 {
+		t.Fatalf("unexpected line. %d", x)
+	}
+}